thailang4r 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +201 -20
- data/README.md +23 -0
- data/lib/thailang4r/roman.rb +169 -0
- data/lib/thailang4r/roman.rb~ +3 -0
- data/lib/thailang4r/word_breaker.rb +245 -20
- metadata +22 -40
- data/lib/thailang4r/dict.rb +0 -89
- data/lib/thailang4r/ranges_builder.rb +0 -90
- data/lib/thailang4r/word_dag_builder.rb +0 -78
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e1af7a4c89e108e784964062d2381ae2b2b81715eb5c0f2da4e6c5aa9604426b
|
4
|
+
data.tar.gz: 85aaded96eea07d8369007f76adac54abff69fc246ae798da78da1d6ffa51887
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fc738de378f475f7eafa0439176293cd0b524cc3dd8cc246421bb7dc3b444b57f9e12ebdcfffebd56ce3cbdeb3226f72b4909674ae5438b942b040304c600799
|
7
|
+
data.tar.gz: 53aab1f982c0aeadc812e2e2618a1651f1e620dd745d54c2b42195bc3ffd8c60152f3f8fe6be08d73fb95f95bd7f3dff10edc0e91c50a5e2bc12bc3d3589473b
|
data/LICENSE
CHANGED
@@ -1,20 +1,201 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
the
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
1
|
+
Apache License
|
2
|
+
Version 2.0, January 2004
|
3
|
+
http://www.apache.org/licenses/
|
4
|
+
|
5
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6
|
+
|
7
|
+
1. Definitions.
|
8
|
+
|
9
|
+
"License" shall mean the terms and conditions for use, reproduction,
|
10
|
+
and distribution as defined by Sections 1 through 9 of this document.
|
11
|
+
|
12
|
+
"Licensor" shall mean the copyright owner or entity authorized by
|
13
|
+
the copyright owner that is granting the License.
|
14
|
+
|
15
|
+
"Legal Entity" shall mean the union of the acting entity and all
|
16
|
+
other entities that control, are controlled by, or are under common
|
17
|
+
control with that entity. For the purposes of this definition,
|
18
|
+
"control" means (i) the power, direct or indirect, to cause the
|
19
|
+
direction or management of such entity, whether by contract or
|
20
|
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21
|
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
22
|
+
|
23
|
+
"You" (or "Your") shall mean an individual or Legal Entity
|
24
|
+
exercising permissions granted by this License.
|
25
|
+
|
26
|
+
"Source" form shall mean the preferred form for making modifications,
|
27
|
+
including but not limited to software source code, documentation
|
28
|
+
source, and configuration files.
|
29
|
+
|
30
|
+
"Object" form shall mean any form resulting from mechanical
|
31
|
+
transformation or translation of a Source form, including but
|
32
|
+
not limited to compiled object code, generated documentation,
|
33
|
+
and conversions to other media types.
|
34
|
+
|
35
|
+
"Work" shall mean the work of authorship, whether in Source or
|
36
|
+
Object form, made available under the License, as indicated by a
|
37
|
+
copyright notice that is included in or attached to the work
|
38
|
+
(an example is provided in the Appendix below).
|
39
|
+
|
40
|
+
"Derivative Works" shall mean any work, whether in Source or Object
|
41
|
+
form, that is based on (or derived from) the Work and for which the
|
42
|
+
editorial revisions, annotations, elaborations, or other modifications
|
43
|
+
represent, as a whole, an original work of authorship. For the purposes
|
44
|
+
of this License, Derivative Works shall not include works that remain
|
45
|
+
separable from, or merely link (or bind by name) to the interfaces of,
|
46
|
+
the Work and Derivative Works thereof.
|
47
|
+
|
48
|
+
"Contribution" shall mean any work of authorship, including
|
49
|
+
the original version of the Work and any modifications or additions
|
50
|
+
to that Work or Derivative Works thereof, that is intentionally
|
51
|
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
52
|
+
or by an individual or Legal Entity authorized to submit on behalf of
|
53
|
+
the copyright owner. For the purposes of this definition, "submitted"
|
54
|
+
means any form of electronic, verbal, or written communication sent
|
55
|
+
to the Licensor or its representatives, including but not limited to
|
56
|
+
communication on electronic mailing lists, source code control systems,
|
57
|
+
and issue tracking systems that are managed by, or on behalf of, the
|
58
|
+
Licensor for the purpose of discussing and improving the Work, but
|
59
|
+
excluding communication that is conspicuously marked or otherwise
|
60
|
+
designated in writing by the copyright owner as "Not a Contribution."
|
61
|
+
|
62
|
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63
|
+
on behalf of whom a Contribution has been received by Licensor and
|
64
|
+
subsequently incorporated within the Work.
|
65
|
+
|
66
|
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
67
|
+
this License, each Contributor hereby grants to You a perpetual,
|
68
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69
|
+
copyright license to reproduce, prepare Derivative Works of,
|
70
|
+
publicly display, publicly perform, sublicense, and distribute the
|
71
|
+
Work and such Derivative Works in Source or Object form.
|
72
|
+
|
73
|
+
3. Grant of Patent License. Subject to the terms and conditions of
|
74
|
+
this License, each Contributor hereby grants to You a perpetual,
|
75
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76
|
+
(except as stated in this section) patent license to make, have made,
|
77
|
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78
|
+
where such license applies only to those patent claims licensable
|
79
|
+
by such Contributor that are necessarily infringed by their
|
80
|
+
Contribution(s) alone or by combination of their Contribution(s)
|
81
|
+
with the Work to which such Contribution(s) was submitted. If You
|
82
|
+
institute patent litigation against any entity (including a
|
83
|
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84
|
+
or a Contribution incorporated within the Work constitutes direct
|
85
|
+
or contributory patent infringement, then any patent licenses
|
86
|
+
granted to You under this License for that Work shall terminate
|
87
|
+
as of the date such litigation is filed.
|
88
|
+
|
89
|
+
4. Redistribution. You may reproduce and distribute copies of the
|
90
|
+
Work or Derivative Works thereof in any medium, with or without
|
91
|
+
modifications, and in Source or Object form, provided that You
|
92
|
+
meet the following conditions:
|
93
|
+
|
94
|
+
(a) You must give any other recipients of the Work or
|
95
|
+
Derivative Works a copy of this License; and
|
96
|
+
|
97
|
+
(b) You must cause any modified files to carry prominent notices
|
98
|
+
stating that You changed the files; and
|
99
|
+
|
100
|
+
(c) You must retain, in the Source form of any Derivative Works
|
101
|
+
that You distribute, all copyright, patent, trademark, and
|
102
|
+
attribution notices from the Source form of the Work,
|
103
|
+
excluding those notices that do not pertain to any part of
|
104
|
+
the Derivative Works; and
|
105
|
+
|
106
|
+
(d) If the Work includes a "NOTICE" text file as part of its
|
107
|
+
distribution, then any Derivative Works that You distribute must
|
108
|
+
include a readable copy of the attribution notices contained
|
109
|
+
within such NOTICE file, excluding those notices that do not
|
110
|
+
pertain to any part of the Derivative Works, in at least one
|
111
|
+
of the following places: within a NOTICE text file distributed
|
112
|
+
as part of the Derivative Works; within the Source form or
|
113
|
+
documentation, if provided along with the Derivative Works; or,
|
114
|
+
within a display generated by the Derivative Works, if and
|
115
|
+
wherever such third-party notices normally appear. The contents
|
116
|
+
of the NOTICE file are for informational purposes only and
|
117
|
+
do not modify the License. You may add Your own attribution
|
118
|
+
notices within Derivative Works that You distribute, alongside
|
119
|
+
or as an addendum to the NOTICE text from the Work, provided
|
120
|
+
that such additional attribution notices cannot be construed
|
121
|
+
as modifying the License.
|
122
|
+
|
123
|
+
You may add Your own copyright statement to Your modifications and
|
124
|
+
may provide additional or different license terms and conditions
|
125
|
+
for use, reproduction, or distribution of Your modifications, or
|
126
|
+
for any such Derivative Works as a whole, provided Your use,
|
127
|
+
reproduction, and distribution of the Work otherwise complies with
|
128
|
+
the conditions stated in this License.
|
129
|
+
|
130
|
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131
|
+
any Contribution intentionally submitted for inclusion in the Work
|
132
|
+
by You to the Licensor shall be under the terms and conditions of
|
133
|
+
this License, without any additional terms or conditions.
|
134
|
+
Notwithstanding the above, nothing herein shall supersede or modify
|
135
|
+
the terms of any separate license agreement you may have executed
|
136
|
+
with Licensor regarding such Contributions.
|
137
|
+
|
138
|
+
6. Trademarks. This License does not grant permission to use the trade
|
139
|
+
names, trademarks, service marks, or product names of the Licensor,
|
140
|
+
except as required for reasonable and customary use in describing the
|
141
|
+
origin of the Work and reproducing the content of the NOTICE file.
|
142
|
+
|
143
|
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
144
|
+
agreed to in writing, Licensor provides the Work (and each
|
145
|
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147
|
+
implied, including, without limitation, any warranties or conditions
|
148
|
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149
|
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150
|
+
appropriateness of using or redistributing the Work and assume any
|
151
|
+
risks associated with Your exercise of permissions under this License.
|
152
|
+
|
153
|
+
8. Limitation of Liability. In no event and under no legal theory,
|
154
|
+
whether in tort (including negligence), contract, or otherwise,
|
155
|
+
unless required by applicable law (such as deliberate and grossly
|
156
|
+
negligent acts) or agreed to in writing, shall any Contributor be
|
157
|
+
liable to You for damages, including any direct, indirect, special,
|
158
|
+
incidental, or consequential damages of any character arising as a
|
159
|
+
result of this License or out of the use or inability to use the
|
160
|
+
Work (including but not limited to damages for loss of goodwill,
|
161
|
+
work stoppage, computer failure or malfunction, or any and all
|
162
|
+
other commercial damages or losses), even if such Contributor
|
163
|
+
has been advised of the possibility of such damages.
|
164
|
+
|
165
|
+
9. Accepting Warranty or Additional Liability. While redistributing
|
166
|
+
the Work or Derivative Works thereof, You may choose to offer,
|
167
|
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
168
|
+
or other liability obligations and/or rights consistent with this
|
169
|
+
License. However, in accepting such obligations, You may act only
|
170
|
+
on Your own behalf and on Your sole responsibility, not on behalf
|
171
|
+
of any other Contributor, and only if You agree to indemnify,
|
172
|
+
defend, and hold each Contributor harmless for any liability
|
173
|
+
incurred by, or claims asserted against, such Contributor by reason
|
174
|
+
of your accepting any such warranty or additional liability.
|
175
|
+
|
176
|
+
END OF TERMS AND CONDITIONS
|
177
|
+
|
178
|
+
APPENDIX: How to apply the Apache License to your work.
|
179
|
+
|
180
|
+
To apply the Apache License to your work, attach the following
|
181
|
+
boilerplate notice, with the fields enclosed by brackets "{}"
|
182
|
+
replaced with your own identifying information. (Don't include
|
183
|
+
the brackets!) The text should be enclosed in the appropriate
|
184
|
+
comment syntax for the file format. We also recommend that a
|
185
|
+
file or class name and description of purpose be included on the
|
186
|
+
same "printed page" as the copyright notice for easier
|
187
|
+
identification within third-party archives.
|
188
|
+
|
189
|
+
Copyright {yyyy} {name of copyright owner}
|
190
|
+
|
191
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
192
|
+
you may not use this file except in compliance with the License.
|
193
|
+
You may obtain a copy of the License at
|
194
|
+
|
195
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
196
|
+
|
197
|
+
Unless required by applicable law or agreed to in writing, software
|
198
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
199
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200
|
+
See the License for the specific language governing permissions and
|
201
|
+
limitations under the License.
|
data/README.md
CHANGED
@@ -4,14 +4,37 @@ Thai language utility for Ruby
|
|
4
4
|
|
5
5
|
I built this project to collect and share Thai language tools written in Ruby.
|
6
6
|
|
7
|
+
Installation
|
8
|
+
------------
|
9
|
+
|
10
|
+
> gem install thailang4r
|
11
|
+
|
12
|
+
Character level
|
13
|
+
---------------
|
14
|
+
|
7
15
|
* chlevel is similar to th_chlevel in [libthai](http://linux.thai.net/projects/libthai).
|
8
16
|
* string_chlevel returns an array of levels; for example, string_chlevel("กี") returns [1, 2]
|
9
17
|
|
10
18
|
Word breaker
|
11
19
|
------------
|
20
|
+
|
12
21
|
```ruby
|
13
22
|
# encoding: UTF-8
|
14
23
|
require 'thailang4r/word_breaker'
|
15
24
|
word_breaker = ThaiLang::WordBreaker.new
|
16
25
|
puts word_breaker.break_into_words("ฉันกินข้าว")
|
26
|
+
# ["ฉัน", "กิน", "ข้าว"]
|
27
|
+
```
|
28
|
+
|
29
|
+
Romanization
|
30
|
+
------------
|
31
|
+
|
32
|
+
A port of royin.py transliteration from [PyThaiNLP](https://pythainlp.github.io/).
|
33
|
+
|
34
|
+
```ruby
|
35
|
+
# encoding: UTF-8
|
36
|
+
require 'thailang4r/roman'
|
37
|
+
royin = ThaiLang::Royin.new
|
38
|
+
p royin.romanize("ฉันกินข้าว", "-")
|
39
|
+
# => "chan-kin-khao"
|
17
40
|
```
|
@@ -0,0 +1,169 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require_relative 'word_breaker'
|
4
|
+
|
5
|
+
module ThaiLang
|
6
|
+
# Ported from https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/transliterate/royin.py (be1265d)
|
7
|
+
class Royin
|
8
|
+
# Vowel rewrite rules, one "pattern,replacement" pair per line, kept in the
# order in which they are applied. Placeholders used in the patterns:
#   *  any consonant ก-ฮ
#   #  one of คนพมห
#   $  one of กตทปศส
# Each placeholder becomes a capture group that the replacement refers to
# as \1, \2.
vowel_patterns = <<~PATTERNS.chomp
  เ*ียว,\\1iao
  แ*็ว,\\1aeo
  เ*ือย,\\1ueai
  แ*ว,\\1aeo
  เ*็ว,\\1eo
  เ*ว,\\1eo
  *ิว,\\1io
  *วย,\\1uai
  เ*ย,\\1oei
  *อย,\\1oi
  โ*ย,\\1oi
  *ุย,\\1ui
  *าย,\\1ai
  ไ*ย,\\1ai
  *ัย,\\1ai
  ไ**,\\1\\2ai
  ไ*,\\1ai
  ใ*,\\1ai
  *ว*,\\1ua\\2
  *ัวะ,\\1ua
  *ัว,\\1ua
  เ*ือะ,\\1uea
  เ*ือ,\\1uea
  เ*ียะ,\\1ia
  เ*ีย,\\1ia
  เ*อะ,\\1oe
  เ*อ,\\1oe
  เ*ิ,\\1oe
  *อ,\\1o
  เ*าะ,\\1o
  เ*็,\\1e
  โ*ะ,\\1o
  โ*,\\1o
  แ*ะ,\\1ae
  แ*,\\1ae
  เ*าะ,\\1e
  *าว,\\1ao
  เ*า,\\1ao
  เ*,\\1e
  *ู,\\1u
  *ุ,\\1u
  *ื,\\1ue
  *ึ,\\1ue
  *ี,\\1i
  *ิ,\\1i
  *ำ,\\1am
  *า,\\1a
  *ั,\\1a
  *ะ,\\1a
  #ฤ,\\1rue
  $ฤ,\\1ri
PATTERNS

# Character class substituted for each placeholder.
placeholder_classes = {
  "*" => "([ก-ฮ])",
  "#" => "([คนพมห])",
  "$" => "([กตทปศส])"
}

# Array of [Regexp, replacement] pairs, in rule order.
VOWELS = vowel_patterns.lines(chomp: true).map do |rule|
  pattern, replacement = rule.split(",")
  placeholder_classes.each do |mark, char_class|
    pattern = pattern.gsub(mark, char_class)
  end
  [Regexp.new(pattern), replacement]
end
|
66
|
+
|
67
|
+
# Romanization table for consonants.
# Each row is: Thai consonant, romanization as an initial consonant,
# romanization as a final (spelling) consonant. The resulting hash maps
# consonant => [initial, final].
CONSONANTS = [
  ["ก", "k", "k"],
  ["ข", "kh", "k"],
  ["ฃ", "kh", "k"],
  ["ค", "kh", "k"],
  ["ฅ", "kh", "k"],
  ["ฆ", "kh", "k"],
  ["ง", "ng", "ng"],
  ["จ", "ch", "t"],
  ["ฉ", "ch", "t"],
  ["ช", "ch", "t"],
  ["ซ", "s", "t"],
  ["ฌ", "ch", "t"],
  ["ญ", "y", "n"],
  ["ฎ", "d", "t"],
  ["ฏ", "t", "t"],
  ["ฐ", "th", "t"],
  # ฑ as an initial consonant can also be "d"
  ["ฑ", "th", "t"],
  ["ฒ", "th", "t"],
  ["ณ", "n", "n"],
  ["ด", "d", "t"],
  ["ต", "t", "t"],
  ["ถ", "th", "t"],
  ["ท", "th", "t"],
  ["ธ", "th", "t"],
  ["น", "n", "n"],
  ["บ", "b", "p"],
  ["ป", "p", "p"],
  ["ผ", "ph", "p"],
  ["ฝ", "f", "p"],
  ["พ", "ph", "p"],
  ["ฟ", "f", "p"],
  ["ภ", "ph", "p"],
  ["ม", "m", "m"],
  ["ย", "y", ""],
  ["ร", "r", "n"],
  ["ฤ", "rue", ""],
  ["ล", "l", "n"],
  ["ว", "w", ""],
  ["ศ", "s", "t"],
  ["ษ", "s", "t"],
  ["ส", "s", "t"],
  ["ห", "h", ""],
  ["ฬ", "l", "n"],
  ["อ", "", ""],
  ["ฮ", "h", ""]
].to_h { |thai, as_initial, as_final| [thai, [as_initial, as_final]] }
|
116
|
+
|
117
|
+
# Strips characters that are not romanized: silenced syllable endings
# (a consonant, optionally followed by a vowel sign ะ-ู, carrying the
# thanthakhat mark ์, plus a few fixed clusters), tone marks and other signs.
#
# The "consonant + vowel sign + ์" alternative needs the vowel range in
# brackets ([ะ-ู]); without them the pattern matched the literal text "ะ-ู"
# and words such as สิทธิ์ kept their silent syllable.
#
# @param word [String] a Thai word
# @return [String] a new string with the silent/unromanized characters removed
def normalize(word)
  word.gsub(/จน์|มณ์|ณฑ์|ทร์|ตร์|[ก-ฮ]์|[ก-ฮ][ะ-ู]์|[ฯๆ่-๏๚๛]/, "")
end
|
120
|
+
|
121
|
+
# Applies every VOWELS rule, in order, to the word.
#
# Uses non-destructive gsub so the caller's string is left untouched and
# frozen strings are accepted (the previous gsub! mutated the argument).
#
# @param word [String] normalized Thai word
# @return [String] the word with vowel patterns rewritten to Latin letters
def replace_vowel(word)
  VOWELS.reduce(word) do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end
end
|
125
|
+
|
126
|
+
# Replaces the Thai consonants of a vowel-rewritten word with Latin letters.
#
# The first consonant takes its initial form (CONSONANTS[c][0]); later
# consonants take their final form (CONSONANTS[c][1]). "รร" becomes "an" when
# it ends the word and "a" otherwise.
#
# Fixes over the previous version:
# * an empty consonant list returns the word unchanged (an empty Array is
#   truthy in Ruby, so Latin words and spaces used to be turned into "");
# * text left after the last consonant (typically a romanized vowel) is kept
#   instead of being dropped;
# * slicing past the end of the remaining text no longer yields nil.
#
# @param word [String] word whose vowels have already been rewritten
# @param consonants [Array<String>, nil] Thai consonants of the word, in order
# @return [String] romanized word
def replace_consonants(word, consonants)
  return word if consonants.nil? || consonants.empty?
  return word.gsub(consonants[0], CONSONANTS[consonants[0]][0]) if consonants.length == 1

  state = consonants.reduce({rom: "", th: word}) do |acc, consonant|
    rom = acc[:rom]
    th = acc[:th]

    # Move any leading run of non-Thai characters (already romanized text)
    # from the pending input to the output.
    leading = th[/^[^ก-์]+/]
    if leading
      rom += leading
      th = th[leading.length..-1] || ""
    end

    if acc[:skip]
      # Second ร of a "รร" pair: already consumed by the previous step.
      {rom: rom, th: th}
    elsif rom == "" && th == "ห"
      # NOTE(review): this compares the whole remaining text with "ห"; it looks
      # like it was meant to drop a leading silent ห — confirm against upstream.
      {rom: "", th: th[1..-1] || ""}
    elsif rom == ""
      {rom: CONSONANTS[consonant][0], th: th[consonant.length..-1] || ""}
    elsif consonant == "ร" && th == "รร"
      {rom: rom + "an", th: th[2..-1] || "", skip: true}
    elsif consonant == "ร" && th[0..1] == "รร"
      {rom: rom + "a", th: th[2..-1] || "", skip: true}
    else
      {rom: rom + CONSONANTS[consonant][1], th: th[consonant.length..-1] || ""}
    end
  end

  # Keep whatever follows the last consonant.
  state[:rom] + state[:th]
end
|
150
|
+
|
151
|
+
# Romanizes a single Thai word: normalize, rewrite vowels, then rewrite
# consonants.
#
# @param word [String] one token
# @return [String] romanized token
def romanize_word(word)
  prepared = replace_vowel(normalize(word))
  consonants = prepared.scan(/[ก-ฮ]/)

  # A word made of exactly two consonants has an implicit "o" between them.
  if prepared.length == 2 && consonants.length == 2
    prepared = "#{prepared[0]}o#{prepared[1]}"
  end

  replace_consonants(prepared, consonants)
end
|
162
|
+
|
163
|
+
# Tokenizer used by #romanize. It is created once, when this class body is
# evaluated, with WordBreaker's default dictionary path.
WORDCUT = WordBreaker.new
|
164
|
+
|
165
|
+
# Splits the text into words with WORDCUT and romanizes each word.
#
# @param text [String] Thai text
# @param delim [String] separator placed between romanized words
# @return [String] romanized text
def romanize(text, delim = "")
  tokens = WORDCUT.break_into_words(text)
  tokens.map { |token| romanize_word(token) }.join(delim)
end
|
168
|
+
end
|
169
|
+
end
|
@@ -1,29 +1,254 @@
|
|
1
|
-
|
2
|
-
require 'thailang4r/dict.rb'
|
3
|
-
require 'thailang4r/word_dag_builder.rb'
|
4
|
-
require 'thailang4r/ranges_builder.rb'
|
1
|
+
# coding: utf-8
|
5
2
|
|
6
3
|
module ThaiLang
|
4
|
+
# Positional indexes of a node key (row_no, offset, ch). NodeKey below exposes
# the same fields by name; these constants are not referenced in the code
# visible here.
NODE_KEY_ROW_NO = 0
NODE_KEY_OFFSET = 1
NODE_KEY_CH = 2

# Positional indexes of a node pointer: [row_no, is_final, payload].
NODE_PTR_ROW_NO = 0
NODE_PTR_IS_FINAL = 1
NODE_PTR_PAYLOAD = 2

# Hash key of a tree edge: row of the parent node, character offset within
# the word, and the character's codepoint.
NodeKey = Struct.new(:row_no, :offset, :ch)

# Prefix tree stored in a flat Hash keyed by NodeKey. A node is identified by
# the index ("row") of the word that first created it.
class PrefixTree
  # Builds the tree.
  #
  # A word that is a prefix of a previously inserted word (possible when the
  # input is not sorted) is marked as final on its existing node; before this
  # change such a word was silently lost.
  #
  # @param sorted_words_with_payload [Enumerable<Array(String, Object)>]
  #   pairs of word and payload
  def initialize(sorted_words_with_payload)
    @prefix_tree = {}
    sorted_words_with_payload.each_with_index do |(word, payload), word_no|
      row_no = 0
      codepoints = word.codepoints
      last_offset = codepoints.length - 1
      codepoints.each_with_index do |ch, offset|
        key = NodeKey.new(row_no, offset, ch)
        node = @prefix_tree[key]
        if node
          # The path already exists; make sure the end of this word is final.
          if offset == last_offset && !node[NODE_PTR_IS_FINAL]
            node[NODE_PTR_IS_FINAL] = true
            node[NODE_PTR_PAYLOAD] = payload
          end
        else
          is_final = (offset == last_offset)
          node = [word_no, is_final, is_final ? payload : nil]
          @prefix_tree[key] = node
        end
        row_no = node[NODE_PTR_ROW_NO]
      end
    end
  end

  # Follows one edge.
  #
  # @param row_id [Integer] row of the current node (0 for the root)
  # @param offset [Integer] offset of the character within the word
  # @param ch [Integer] codepoint of the character
  # @return [Array, nil] node pointer [row_no, is_final, payload] or nil
  def lookup(row_id, offset, ch)
    @prefix_tree[NodeKey.new(row_id, offset, ch)]
  end
end
|
40
|
+
|
41
|
+
# Path of the default dictionary (data/tdict-std.txt), resolved relative to
# this source file. Used as the default argument of WordBreaker#initialize.
DEFAULT_THAI_DICT_PATH = File.expand_path('../../../data/tdict-std.txt', __FILE__)
|
42
|
+
|
7
43
|
class WordBreaker
|
44
|
+
# Loads the dictionary (one word per line) into a prefix tree.
#
# File.foreach closes the file once iteration finishes; the previous
# File.open(...).each_line left the handle open until garbage collection.
#
# @param dix_path [String] path of the dictionary file
def initialize(dix_path = DEFAULT_THAI_DICT_PATH)
  @dix = PrefixTree.new(File.foreach(dix_path).map { |line| [line.chomp, 1] })
end
|
47
|
+
|
48
|
+
# Splits text into tokens using the dictionary loaded by the constructor.
#
# @param text [String] text to segment
# @return [Array<String>] tokens in order
def break_into_words(text)
  dictionary = @dix
  tokenize(dictionary, text)
end
|
51
|
+
|
52
|
+
# Kinds of link (token) stored in the fourth slot of a link.
UNK = 1
DICT = 2
INIT = 3
LATIN = 4
PUNC = 5

# Positional indexes of a link: [previous boundary index, number of tokens so
# far, number of unknown tokens so far, kind].
LINK_P_IDX = 0
LINK_W = 1
LINK_UNK = 2
LINK_KIND = 3

# Tells whether link l is strictly better than link r: fewer unknown tokens
# first, then fewer tokens.
#
# The previous "unk < unk or w < w" test was not an ordering: a link with more
# unknown tokens but fewer tokens beat a fully known segmentation, and two
# links could each be "better" than the other.
#
# @param l [Array] candidate link
# @param r [Array] current best link
# @return [Boolean]
def better_link?(l, r)
  return l[LINK_UNK] < r[LINK_UNK] if l[LINK_UNK] != r[LINK_UNK]

  l[LINK_W] < r[LINK_W]
end
|
66
|
+
|
67
|
+
# Transducer states.
WAITING = 1
ACTIVATED = 2
COMPLETED = 3

CAP_A = "A".ord
CAP_Z = "Z".ord
A = "a".ord
Z = "z".ord

# @param ch [Integer] codepoint
# @return [Boolean] true for ASCII letters A-Z and a-z
def latin?(ch)
  (ch >= CAP_A && ch <= CAP_Z) || (ch >= A && ch <= Z)
end

# Positional indexes of a transducer: [start, end, state, kind].
# The state and kind indexes now match the 4-element arrays built by
# build_path ([0, 0, WAITING, LATIN]); they used to be 3 and 4, so the state
# was written into the kind slot and the kind index pointed past the array.
TRANSDUCER_S = 0
TRANSDUCER_E = 1
TRANSDUCER_STATE = 2
TRANSDUCER_KIND = 3

# Feeds one character to the transducer that recognizes runs of Latin letters.
#
# @param transducer [Array] [start, end, state, kind], updated in place
# @param ch [Integer] codepoint at index i
# @param i [Integer] index of ch in ch_vec
# @param ch_vec [Array<Integer>] codepoints of the whole text
def update_latin_transducer(transducer, ch, i, ch_vec)
  update_transducer(transducer, ch, i, ch_vec) { |c| latin?(c) }
end

SPACE = " ".ord

# @param ch [Integer] codepoint
# @return [Boolean] true for the space character
def punc?(ch)
  ch == SPACE
end

# Feeds one character to the transducer that recognizes runs of spaces.
# Parameters are the same as for update_latin_transducer.
def update_punc_transducer(transducer, ch, i, ch_vec)
  update_transducer(transducer, ch, i, ch_vec) { |c| punc?(c) }
end

# Shared state machine of both transducers; the block tells whether a
# codepoint belongs to the run being recognized.
#
# WAITING: a matching character starts a run at i; the run is COMPLETED right
# away when the next character does not match (or the text ends), otherwise
# it is ACTIVATED. In any other state a matching character extends the run to
# i + 1 and marks it COMPLETED; a non-matching one resets to WAITING.
def update_transducer(transducer, ch, i, ch_vec)
  if transducer[TRANSDUCER_STATE] == WAITING
    return unless yield(ch)

    transducer[TRANSDUCER_S] = i
    transducer[TRANSDUCER_STATE] = ACTIVATED
    if i + 1 == ch_vec.length || !yield(ch_vec[i + 1])
      transducer[TRANSDUCER_E] = i + 1
      transducer[TRANSDUCER_STATE] = COMPLETED
    end
  elsif yield(ch)
    transducer[TRANSDUCER_E] = i + 1
    transducer[TRANSDUCER_STATE] = COMPLETED
  else
    transducer[TRANSDUCER_STATE] = WAITING
  end
end
|
131
|
+
|
132
|
+
# Positional indexes of a dictionary pointer: [start index of the candidate
# word, current prefix-tree row, whether the characters read so far form a
# complete dictionary word].
DIX_PTR_S = 0
DIX_PTR_ROW_NO = 1
DIX_PTR_IS_FINAL = 2

# Builds the best-link table for the text.
#
# path[k] is the best link ending at boundary k (path[0] is the INIT link),
# so the link chosen for the character at index i is stored at path[i + 1].
# For every character the candidates are: an unknown token starting at the
# last boundary that ended a known token, every dictionary word ending here,
# and a Latin or space run ending here; better_link? picks the winner.
#
# Fix: after a known (non-UNK) link the left boundary is now i + 1, the index
# of the link just stored. It used to be i, so a following unknown run started
# one character too early and took the last character of the known token.
#
# @param dix [#lookup] prefix tree of dictionary words
# @param s [String] text to segment
# @return [Array<Array>] links, one per boundary
def build_path(dix, s)
  left_boundary = 0
  ch_vec = s.codepoints
  path = [[0, 0, 0, INIT]]
  dix_ptrs = []
  latin_transducer = [0, 0, WAITING, LATIN]
  punc_transducer = [0, 0, WAITING, PUNC]

  ch_vec.each_with_index do |ch, i|
    # A dictionary word may start at every position.
    dix_ptrs << [i, 0, false]

    # Default candidate: unknown token from the last known boundary.
    unk_link = path[left_boundary]
    link = [left_boundary, unk_link[LINK_W] + 1, unk_link[LINK_UNK] + 1, UNK]

    # Advance every pointer by ch; pointers that cannot advance are removed by
    # swapping in the last element, so j is only incremented on success.
    j = 0
    while j < dix_ptrs.length
      dix_ptr = dix_ptrs[j]
      offset = i - dix_ptr[DIX_PTR_S]
      row_no = dix_ptr[DIX_PTR_ROW_NO]
      child = dix.lookup(row_no, offset, ch)
      if child
        dix_ptrs[j] = [dix_ptr[DIX_PTR_S], child[NODE_PTR_ROW_NO], child[NODE_PTR_IS_FINAL]]
        j += 1
      elsif j + 1 == dix_ptrs.length
        dix_ptrs.pop
      else
        dix_ptrs[j] = dix_ptrs.pop
      end
    end

    update_latin_transducer(latin_transducer, ch, i, ch_vec)
    update_punc_transducer(punc_transducer, ch, i, ch_vec)

    # Dictionary words ending at this character.
    dix_ptrs.each do |dix_ptr|
      next unless dix_ptr[DIX_PTR_IS_FINAL]

      start = dix_ptr[DIX_PTR_S]
      prev_link = path[start]
      new_link = [start, prev_link[LINK_W] + 1, prev_link[LINK_UNK], DICT]
      link = new_link if better_link?(new_link, link)
    end

    # Latin and space runs ending at this character.
    [[latin_transducer, LATIN], [punc_transducer, PUNC]].each do |transducer, kind|
      next unless transducer[TRANSDUCER_STATE] == COMPLETED

      start = transducer[TRANSDUCER_S]
      prev_link = path[start]
      new_link = [start, prev_link[LINK_W] + 1, prev_link[LINK_UNK], kind]
      link = new_link if better_link?(new_link, link)
    end

    left_boundary = i + 1 if link[LINK_KIND] != UNK
    path << link
  end

  path
end
|
204
|
+
|
205
|
+
# Positional indexes of a range: [start, end).
RANGE_S = 0
RANGE_E = 1

# Walks the link table backwards from the last boundary, following each
# link's previous-boundary index, and returns the token ranges in text order.
#
# @param path [Array<Array>] links as built by build_path
# @return [Array<Array(Integer, Integer)>] [start, end) pairs
def path_to_ranges(path)
  ranges = []
  stop = path.length - 1
  until stop <= 0
    start = path[stop][LINK_P_IDX]
    ranges.unshift([start, stop])
    stop = start
  end
  ranges
end
|
219
|
+
|
220
|
+
# Cuts the text at the given ranges.
#
# @param ranges [Array<Array(Integer, Integer)>] [start, end) character ranges
# @param str [String] the segmented text
# @return [Array<String>] one substring per range
def ranges_to_toks(ranges, str)
  ranges.map do |(first, past_last)|
    str[first...past_last]
  end
end
|
223
|
+
|
224
|
+
# Segments the text: build the link table, turn it into ranges, cut the text.
#
# @param dix [#lookup] prefix tree of dictionary words
# @param str [String] text to segment
# @return [Array<String>] tokens in order
def tokenize(dix, str)
  path = build_path(dix, str)
  ranges = path_to_ranges(path)
  ranges_to_toks(ranges, str)
end
|
227
|
+
|
228
|
+
# Segments the text and joins the tokens with a delimiter.
#
# @param dix [#lookup] prefix tree of dictionary words
# @param str [String] text to segment
# @param delim [String] separator placed between tokens
# @return [String]
def tokenize_with_delim(dix, str, delim)
  tokens = tokenize(dix, str)
  tokens.join(delim)
end
|
28
231
|
end
|
29
|
-
end
|
232
|
+
end
|
233
|
+
|
234
|
+
|
235
|
+
#dix = APrefixTree.new([["กา",1],["กาก",1]])
|
236
|
+
#p dix
|
237
|
+
#p tokenize_with_delim(dix, "บทความนี้ใช้ระบบคริสต์ศักราช เพราะอ้างอิงคริสต์ศักราชและคริสต์ศตวรรษ หรืออย่างใดอย่างหนึ่ง", "|")
|
238
|
+
|
239
|
+
|
240
|
+
#t2 = [0,0,WAITING,LATIN]
|
241
|
+
#update_punc_transducer(t2, 32, 0, [32])
|
242
|
+
#p t2
|
243
|
+
#t1 = PrefixTree.new([["A",1]])
|
244
|
+
#p t1.lookup(0, 0, "A".codepoints[0])
|
245
|
+
|
246
|
+
#t2 = [0,0,WAITING,LATIN]
|
247
|
+
#update_punc_transducer(t2, 32, 0, [32])
|
248
|
+
#p t2
|
249
|
+
#t1 = PrefixTree.new([["A",1]])
|
250
|
+
#p t1.lookup(0, 0, "A".codepoints[0])
|
251
|
+
|
252
|
+
|
253
|
+
#word_breaker = ThaiLang::WordBreaker.new
|
254
|
+
#puts word_breaker.break_into_words("ฉันกินข้าว")
|
metadata
CHANGED
@@ -1,71 +1,53 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: thailang4r
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.2
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Vee Satayamas
|
9
|
-
autorequire:
|
8
|
+
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
13
|
-
dependencies:
|
14
|
-
|
15
|
-
|
16
|
-
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
|
-
requirements:
|
19
|
-
- - ~>
|
20
|
-
- !ruby/object:Gem::Version
|
21
|
-
version: 1.2.1
|
22
|
-
type: :development
|
23
|
-
prerelease: false
|
24
|
-
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
|
-
requirements:
|
27
|
-
- - ~>
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
version: 1.2.1
|
30
|
-
description: Thai language utility for Ruby
|
11
|
+
date: 2021-04-26 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Thai language tools for Ruby, i.e. a word tokenizer, a character level
|
14
|
+
identifier, and a romanization tool
|
31
15
|
email:
|
32
|
-
-
|
16
|
+
- 5ssgdxltv@relay.firefox.com
|
33
17
|
executables: []
|
34
18
|
extensions: []
|
35
19
|
extra_rdoc_files: []
|
36
20
|
files:
|
37
|
-
- lib/thailang4r.rb
|
38
|
-
- lib/thailang4r/word_dag_builder.rb
|
39
|
-
- lib/thailang4r/dict.rb
|
40
|
-
- lib/thailang4r/ranges_builder.rb
|
41
|
-
- lib/thailang4r/word_breaker.rb
|
42
21
|
- LICENSE
|
43
22
|
- README.md
|
44
23
|
- Rakefile
|
45
|
-
- data/test_dict.txt
|
46
24
|
- data/tdict-std.txt
|
25
|
+
- data/test_dict.txt
|
26
|
+
- lib/thailang4r.rb
|
27
|
+
- lib/thailang4r/roman.rb
|
28
|
+
- lib/thailang4r/roman.rb~
|
29
|
+
- lib/thailang4r/word_breaker.rb
|
47
30
|
homepage: https://github.com/veer66/thailang4r
|
48
|
-
licenses:
|
49
|
-
|
31
|
+
licenses:
|
32
|
+
- Apache-2.0
|
33
|
+
metadata: {}
|
34
|
+
post_install_message:
|
50
35
|
rdoc_options: []
|
51
36
|
require_paths:
|
52
37
|
- lib
|
53
38
|
required_ruby_version: !ruby/object:Gem::Requirement
|
54
|
-
none: false
|
55
39
|
requirements:
|
56
|
-
- -
|
40
|
+
- - ">="
|
57
41
|
- !ruby/object:Gem::Version
|
58
|
-
version:
|
42
|
+
version: 3.0.0
|
59
43
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
-
none: false
|
61
44
|
requirements:
|
62
|
-
- -
|
45
|
+
- - ">="
|
63
46
|
- !ruby/object:Gem::Version
|
64
47
|
version: '0'
|
65
48
|
requirements: []
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
specification_version: 3
|
49
|
+
rubygems_version: 3.2.3
|
50
|
+
signing_key:
|
51
|
+
specification_version: 4
|
70
52
|
summary: Thai language utility for Ruby
|
71
53
|
test_files: []
|
data/lib/thailang4r/dict.rb
DELETED
@@ -1,89 +0,0 @@
|
|
1
|
-
module ThaiLang
|
2
|
-
# Word list loaded from a file (one word per line) with binary search on a
# single character position. The searches assume the lines are ordered on the
# searched position within the given index range.
class Dict
  # @param file_path [String] path of the word list
  def initialize(file_path)
    load_dict(file_path)
  end

  # Reads the file and stores its lines without line terminators.
  def load_dict(file_path)
    @str_list = File.open(file_path) { |f| f.readlines.map(&:chomp) }
  end

  # Index of the first word in [s, e) whose character at offset equals prefix.
  def find_first_index_of_needle(prefix, offset = nil, s = nil, e = nil)
    find_index_of_needle(:FIRST, prefix, offset, s, e)
  end

  # Index of the last word in [s, e) whose character at offset equals prefix.
  def find_last_index_of_needle(prefix, offset = nil, s = nil, e = nil)
    find_index_of_needle(:LAST, prefix, offset, s, e)
  end

  # Binary search over words s...e comparing the character at offset.
  #
  # @param pos_type [Symbol] :FIRST for the leftmost match, anything else for
  #   the rightmost
  # @param prefix [String] character to look for
  # @param offset [Integer, nil] character position (default 0)
  # @param s [Integer, nil] first index searched (default 0)
  # @param e [Integer, nil] index after the last one searched (default: size)
  # @return [Integer, nil] index of the match, or nil
  def find_index_of_needle(pos_type, prefix, offset = nil, s = nil, e = nil)
    offset = 0 if offset.nil?
    low = s.nil? ? 0 : s
    high = (e.nil? ? @str_list.length : e) - 1
    found = nil

    while low <= high
      mid = (low + high) / 2
      ch = @str_list[mid][offset]
      if ch.nil? || prefix > ch
        # Words too short for this offset sort before the needle.
        low = mid + 1
      elsif prefix < ch
        high = mid - 1
      else
        found = mid
        # Keep narrowing towards the requested end of the run of matches.
        if pos_type == :FIRST
          high = mid - 1
        else
          low = mid + 1
        end
      end
    end

    found
  end

  # @return [Integer] number of words
  def size
    @str_list.length
  end

  # @return [String, nil] word at index i
  def [](i)
    @str_list[i]
  end
end
|
58
|
-
|
59
|
-
# Cursor that narrows a range of dictionary words one character at a time.
class DictIter
  # @param dict [#size, #[], #find_first_index_of_needle,
  #   #find_last_index_of_needle] dictionary to walk
  def initialize(dict)
    @dict = dict
    @s = 0
    @e = @dict.size
    @offset = 0
    @state = :ACTIVE
  end

  # Consumes one character.
  #
  # @param ch [String] next character
  # @return [Symbol] :INVALID when no word continues with ch (the state then
  #   stays :INVALID), :ACTIVE_BOUNDARY when the first word of the narrowed
  #   range has exactly the consumed length, :ACTIVE otherwise
  def walk(ch)
    return @state if @state == :INVALID

    first = @dict.find_first_index_of_needle(ch, @offset, @s, @e)
    if first.nil?
      @state = :INVALID
      return @state
    end

    @s = first
    last = @dict.find_last_index_of_needle(ch, @offset, @s, @e)
    @e = last + 1
    @offset += 1
    @state = @offset == @dict[first].length ? :ACTIVE_BOUNDARY : :ACTIVE
  end
end
|
89
|
-
end
|
@@ -1,90 +0,0 @@
|
|
1
|
-
module ThaiLang
  # Picks one sequence of ranges covering positions 0..len out of a DAG of
  # candidate ranges, preferring fewer unknown segments and then fewer
  # segments overall.
  #
  # A DAG entry is an array [start, end, link_type]; a path entry is an array
  # [pointer, weight, unknown_count, link_type].
  class RangesBuilder
    # Field positions inside a DAG range.
    S = 0
    E = 1
    LINK_TYPE = 2

    # Field positions inside a path entry.
    POINTER = 0
    WEIGHT = 1
    PATH_UNK = 2
    PATH_LINK_TYPE = 3

    # Groups the ranges of +dag+ by the field at index +pos+, keeping the
    # original order inside each group.
    #
    # @return [Hash{Integer => Array<Array>}]
    def _build_index(dag, pos)
      dag.group_by { |range| range[pos] }
    end

    # Index of ranges keyed by their end position.
    def _build_e_index(dag)
      _build_index(dag, E)
    end

    # Index of ranges keyed by their start position.
    def _build_s_index(dag)
      _build_index(dag, S)
    end

    # True when path entry +a+ is strictly better than +b+: fewer unknown
    # segments first, and on a tie fewer segments overall. (Requiring both to
    # be smaller at once would reject a candidate that ties on unknowns but
    # uses fewer segments.)
    def _compare_path_info(a, b)
      return a[PATH_UNK] < b[PATH_UNK] if a[PATH_UNK] != b[PATH_UNK]

      a[WEIGHT] < b[WEIGHT]
    end

    # Fills the best-path table for positions 0..len.
    #
    # @param len [Integer] length of the text being segmented
    # @param s_index [Hash] ranges keyed by start position
    # @param e_index [Hash] ranges keyed by end position
    # @return [Array<Array, nil>] path[i] is the best entry ending at i, or
    #   nil when position i is not a segment boundary
    def _build_path(len, s_index, e_index)
      path = Array.new(len + 1)
      path[0] = [0, 0, 0, :UNK]
      left_boundary = 0

      (1..len).each do |i|
        if e_index.key?(i)
          e_index[i].each do |range|
            s = range[S]
            next if path[s].nil?

            info = [s, path[s][WEIGHT] + 1, path[s][PATH_UNK], range[LINK_TYPE]]
            path[i] = info if path[i].nil? || _compare_path_info(info, path[i])
          end
          left_boundary = i unless path[i].nil?
        end

        # A range starts here but nothing reaches this position: bridge the
        # gap from the last reachable boundary with an unknown segment.
        if path[i].nil? && s_index.key?(i)
          path[i] = [left_boundary,
                     path[left_boundary][WEIGHT] + 1,
                     path[left_boundary][PATH_UNK] + 1,
                     :UNK]
        end
      end

      # Make sure the end of the text is always reachable.
      if path[len].nil?
        path[len] = [left_boundary,
                     path[left_boundary][WEIGHT] + 1,
                     path[left_boundary][PATH_UNK] + 1,
                     :UNK]
      end
      path
    end

    # Follows the back pointers from +len+ to 0 and returns the chosen ranges
    # in left-to-right order as [start, end, link_type] arrays.
    def _path_to_ranges(path, len)
      ranges = []
      i = len
      while i > 0
        info = path[i]
        s = info[POINTER]
        ranges << [s, i, info[PATH_LINK_TYPE]]
        i = s
      end
      ranges.reverse
    end

    # @param dag [Array<Array>] candidate ranges [start, end, link_type]
    # @param len [Integer] length of the text being segmented
    # @return [Array<Array>] selected ranges covering 0..len, left to right
    def build_from_dag(dag, len)
      s_index = _build_s_index(dag)
      e_index = _build_e_index(dag)
      path = _build_path(len, s_index, e_index)
      _path_to_ranges(path, len)
    end
  end
end
|
@@ -1,78 +0,0 @@
|
|
1
|
-
module ThaiLang
  # Builds the DAG of candidate ranges for a string. Every DAG entry is an
  # array [start, end, type] where +end+ is exclusive.
  class WordDagBuilder
    # A single ASCII letter.
    LATIN_CHAR = /[A-Za-z]/

    # @param dict [Object] dictionary handed to DictIter.new
    def initialize(dict)
      @dict = dict
    end

    # @param string [String] text to analyse
    # @param len [Integer] number of characters of +string+ to look at
    # @return [Array<Array>] candidate ranges sorted by start, end, then type
    def build(string, len)
      dag = []
      _build_by_dict(dag, string, len)
      #_build_by_latin_rule(dag, string, len)
      # Entries are 3-element arrays, so the default Array ordering compares
      # start, then end, then type.
      dag.sort
    end

    # Appends [i, e, :SPACE] for every run of spaces starting at i and
    # [i, e, :LATIN] for every run of ASCII letters starting at i. A position
    # that lies inside a Latin run already recorded does not start a new
    # :LATIN entry.
    #
    # @param dag [Array<Array>] list to append to (mutated)
    # @param string [String]
    # @param len [Integer]
    # @return [Array<Array>] +dag+
    def _build_by_latin_rule(dag, string, len)
      next_latin = 0
      (0...len).each do |i|
        space_e = nil
        latin_e = nil
        space_break = false
        latin_break = false

        (i...len).each do |j|
          break if space_break && latin_break

          ch = string[j]
          unless space_break
            if ch == " "
              space_e = j + 1
            else
              space_break = true
            end
          end

          # Only keep scanning letters while the run is unbroken and we are
          # past the end of the previously recorded Latin run.
          if !latin_break && j >= next_latin
            if LATIN_CHAR.match(ch)
              latin_e = j + 1
            else
              latin_break = true
            end
          end
        end

        dag << [i, space_e, :SPACE] unless space_e.nil?
        unless latin_e.nil?
          dag << [i, latin_e, :LATIN]
          next_latin = latin_e
        end
      end
      dag
    end

    # Appends [i, j + 1, :DICT] for every substring string[i..j] at which a
    # fresh DictIter reports :ACTIVE_BOUNDARY, stopping each scan as soon as
    # the iterator reports :INVALID.
    #
    # @param dag [Array<Array>] list to append to (mutated)
    # @param string [String]
    # @param len [Integer]
    # @return [Array<Array>] +dag+
    def _build_by_dict(dag, string, len)
      (0...len).each do |i|
        iter = DictIter.new(@dict)
        (i...len).each do |j|
          status = iter.walk(string[j])
          break if status == :INVALID

          dag << [i, j + 1, :DICT] if status == :ACTIVE_BOUNDARY
        end
      end
      dag
    end
  end
end
|