wordcut 0.0.1 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +165 -0
- data/data/tha/tdict-acronyms.txt +471 -0
- data/data/tha/tdict-city.txt +242 -0
- data/data/tha/tdict-collection.txt +54 -0
- data/data/tha/tdict-common.txt +1234 -0
- data/data/tha/tdict-country.txt +217 -0
- data/data/tha/tdict-district.txt +114 -0
- data/data/tha/tdict-geo.txt +53 -0
- data/data/tha/tdict-history.txt +35 -0
- data/data/tha/tdict-ict.txt +260 -0
- data/data/tha/tdict-lang-ethnic.txt +43 -0
- data/data/tha/tdict-proper.txt +393 -0
- data/data/tha/tdict-science.txt +146 -0
- data/data/tha/tdict-spell.txt +82 -0
- data/data/tha/tdict-std-compound.txt +5493 -0
- data/data/tha/tdict-std.txt +15374 -0
- data/wordcut/dag.rb +82 -78
- data/wordcut/dict.rb +39 -38
- data/wordcut/dict_seek.rb +24 -22
- data/wordcut/edge.rb +21 -17
- data/wordcut/edge_builder.rb +16 -14
- data/wordcut/pointer.rb +27 -24
- data/wordcut/space_slicer.rb +25 -21
- data/wordcut/tokenizer.rb +11 -9
- metadata +19 -3
data/wordcut/dag.rb
CHANGED
@@ -2,98 +2,102 @@ require_relative "edge_builder"
|
|
2
2
|
require_relative "pointer"
|
3
3
|
require_relative "space_slicer"
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
|
6
|
+
module Wordcut
|
7
|
+
module DictDagUpdater
|
8
|
+
def update_by_dict(i, pointers)
|
9
|
+
edge = self.build_edges(pointers).min
|
10
|
+
self[i] = edge
|
11
|
+
return i
|
12
|
+
end
|
10
13
|
end
|
11
|
-
end
|
12
14
|
|
13
|
-
module UnkDagUpdater
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
15
|
+
module UnkDagUpdater
|
16
|
+
def update_by_unk(i, left)
|
17
|
+
src = self[left]
|
18
|
+
edge = Edge.new(:s => left,
|
19
|
+
:unk => src.unk + 1,
|
20
|
+
:chunk => src.chunk + 1,
|
21
|
+
:etype => :UNK,
|
22
|
+
:payload => nil)
|
23
|
+
self[i] = edge
|
24
|
+
return left
|
25
|
+
end
|
23
26
|
end
|
24
|
-
end
|
25
27
|
|
26
|
-
module SpaceDagUpdater
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
28
|
+
module SpaceDagUpdater
|
29
|
+
def update_by_space(i, slicer)
|
30
|
+
s = slicer.s
|
31
|
+
src = self[s]
|
32
|
+
edge = Edge.new(:s => s,
|
33
|
+
:unk => src.unk,
|
34
|
+
:chunk => src.chunk + 1,
|
35
|
+
:etype => :SPACE,
|
36
|
+
:payload => nil)
|
37
|
+
self[i] = edge
|
38
|
+
return i
|
39
|
+
end
|
37
40
|
end
|
38
|
-
end
|
39
41
|
|
40
|
-
module BasicDagUpdater
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
42
|
+
module BasicDagUpdater
|
43
|
+
include DictDagUpdater
|
44
|
+
include UnkDagUpdater
|
45
|
+
include SpaceDagUpdater
|
46
|
+
include PointersManipulator
|
47
|
+
|
48
|
+
def update(i, left, pointers, space_slicer)
|
49
|
+
if not pointers&.empty?
|
50
|
+
update_by_dict(i, pointers)
|
51
|
+
elsif space_slicer&.final
|
52
|
+
update_by_space(i, space_slicer)
|
53
|
+
else
|
54
|
+
update_by_unk(i, left)
|
55
|
+
end
|
53
56
|
end
|
54
57
|
end
|
55
|
-
end
|
56
58
|
|
57
|
-
module DagBuilder
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
59
|
+
module DagBuilder
|
60
|
+
def build(dict, txt)
|
61
|
+
self[0] = init_edge
|
62
|
+
pointers = []
|
63
|
+
left = 0
|
64
|
+
space_slicer = SpaceSlicer.new(0)
|
65
|
+
for i in 1..txt.length
|
66
|
+
ch = txt[i - 1]
|
67
|
+
next_ch = i < txt.length ? txt[i] : nil
|
68
|
+
space_slicer.transit(ch, next_ch)
|
69
|
+
pointers << new_pointer(i, dict)
|
70
|
+
pointers = transit(pointers, ch)
|
71
|
+
left = update(i, left, pointers.select(&:final), space_slicer)
|
72
|
+
end
|
70
73
|
end
|
71
74
|
end
|
72
|
-
end
|
73
75
|
|
74
|
-
module DagToToken
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
76
|
+
module DagToToken
|
77
|
+
def tokens(txt)
|
78
|
+
toks = []
|
79
|
+
i = txt.length
|
80
|
+
while i > 0
|
81
|
+
s = self[i].s
|
82
|
+
tok = txt.slice(s, i-s)
|
83
|
+
toks << tok
|
84
|
+
i = s
|
85
|
+
end
|
86
|
+
toks.reverse
|
83
87
|
end
|
84
|
-
toks.reverse
|
85
88
|
end
|
86
|
-
end
|
87
89
|
|
88
|
-
class BasicDag < Array
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
90
|
+
class BasicDag < Array
|
91
|
+
include EdgeBuilder
|
92
|
+
include BasicDagUpdater
|
93
|
+
include DagBuilder
|
94
|
+
include DagToToken
|
95
|
+
|
96
|
+
def self.build(dict, txt)
|
97
|
+
dag = BasicDag.new(txt.length + 1)
|
98
|
+
dag.build(dict, txt)
|
99
|
+
return dag
|
100
|
+
end
|
98
101
|
end
|
102
|
+
|
99
103
|
end
|
data/wordcut/dict.rb
CHANGED
@@ -1,51 +1,52 @@
|
|
1
1
|
require_relative "dict_seek"
|
2
|
-
|
3
|
-
class WordItem
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
end
|
9
|
-
|
10
|
-
module DictInfo
|
11
|
-
def l
|
12
|
-
0
|
2
|
+
module Wordcut
|
3
|
+
class WordItem
|
4
|
+
attr_reader :headword
|
5
|
+
def initialize(headword)
|
6
|
+
@headword = headword
|
7
|
+
end
|
13
8
|
end
|
14
9
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
end
|
10
|
+
module DictInfo
|
11
|
+
def l
|
12
|
+
0
|
13
|
+
end
|
20
14
|
|
21
|
-
|
22
|
-
|
23
|
-
|
15
|
+
def r
|
16
|
+
return nil if self.empty?
|
17
|
+
self.length - 1
|
18
|
+
end
|
24
19
|
end
|
25
|
-
end
|
26
20
|
|
27
|
-
module
|
28
|
-
|
29
|
-
|
30
|
-
|
21
|
+
module PathResolver
|
22
|
+
def resolve_path(lang, name)
|
23
|
+
File.expand_path(File.join(__FILE__, '..', '..', 'data', lang, name, ))
|
24
|
+
end
|
31
25
|
end
|
32
26
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
27
|
+
module BasicDictLoader
|
28
|
+
include PathResolver
|
29
|
+
def load_bundle(lang, name)
|
30
|
+
load(resolve_path(lang, name))
|
31
|
+
end
|
32
|
+
|
33
|
+
def load(path)
|
34
|
+
self.concat(open(path).each_line
|
35
|
+
.map(&:strip)
|
36
|
+
.reject(&:empty?)
|
37
|
+
.map{|w| WordItem.new w})
|
38
|
+
end
|
38
39
|
end
|
39
|
-
end
|
40
40
|
|
41
|
-
class BasicDict < Array
|
42
|
-
|
43
|
-
|
44
|
-
|
41
|
+
class BasicDict < Array
|
42
|
+
include DictInfo
|
43
|
+
include DictSeeker
|
44
|
+
include BasicDictLoader
|
45
45
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
46
|
+
def self.from_bundle(lang, name)
|
47
|
+
dict = self.new
|
48
|
+
dict.load_bundle(lang, name)
|
49
|
+
return dict
|
50
|
+
end
|
50
51
|
end
|
51
52
|
end
|
data/wordcut/dict_seek.rb
CHANGED
@@ -1,28 +1,30 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
1
|
+
module Wordcut
|
2
|
+
module DictSeeker
|
3
|
+
def seek(ch, l, r, offset, policy)
|
4
|
+
idx = nil
|
5
|
+
while l <= r
|
6
|
+
m = (l + r) / 2
|
7
|
+
w = self[m].headword
|
8
|
+
wlen = w.length
|
8
9
|
|
9
|
-
|
10
|
-
l = m + 1
|
11
|
-
else
|
12
|
-
ch_w = w[offset]
|
13
|
-
if ch_w < ch
|
10
|
+
if wlen <= offset
|
14
11
|
l = m + 1
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
12
|
+
else
|
13
|
+
ch_w = w[offset]
|
14
|
+
if ch_w < ch
|
15
|
+
l = m + 1
|
16
|
+
elsif ch_w > ch
|
17
|
+
r = m - 1
|
18
|
+
elsif policy == :LEFT
|
19
|
+
idx = m
|
20
|
+
r = m - 1
|
21
|
+
elsif policy == :RIGHT
|
22
|
+
idx = m
|
23
|
+
l = m + 1
|
24
|
+
end
|
25
|
+
end
|
24
26
|
end
|
27
|
+
return idx
|
25
28
|
end
|
26
|
-
return idx
|
27
29
|
end
|
28
30
|
end
|
data/wordcut/edge.rb
CHANGED
@@ -1,21 +1,25 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
1
|
+
module Wordcut
|
2
|
+
|
3
|
+
class Edge
|
4
|
+
attr_reader :unk, :chunk, :s, :payload, :etype
|
5
|
+
|
6
|
+
CMP_FUNCS = [lambda {|e| e.unk}, lambda {|e| e.chunk}]
|
7
|
+
|
8
|
+
def initialize(args = {})
|
9
|
+
@unk = args[:unk] || 0
|
10
|
+
@chunk = args[:chunk] || 0
|
11
|
+
@s = args[:s] || 0
|
12
|
+
@payload = args[:payload]
|
13
|
+
@etype = args[:etype]
|
14
|
+
end
|
13
15
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
16
|
+
def <=>(o)
|
17
|
+
for fn in CMP_FUNCS
|
18
|
+
cmp = fn.call(self) <=> fn.call(o)
|
19
|
+
return cmp if cmp != 0
|
20
|
+
end
|
21
|
+
return 0
|
18
22
|
end
|
19
|
-
return 0
|
20
23
|
end
|
24
|
+
|
21
25
|
end
|
data/wordcut/edge_builder.rb
CHANGED
@@ -1,18 +1,20 @@
|
|
1
1
|
require_relative "edge.rb"
|
2
2
|
|
3
|
-
module
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
pointers
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
3
|
+
module Wordcut
|
4
|
+
module EdgeBuilder
|
5
|
+
def init_edge
|
6
|
+
Edge.new
|
7
|
+
end
|
8
|
+
|
9
|
+
def build_edges(pointers)
|
10
|
+
pointers.map do |pointer|
|
11
|
+
src = self[pointer.s]
|
12
|
+
Edge.new(:s => pointer.s,
|
13
|
+
:unk => src.unk,
|
14
|
+
:chunk => src.chunk + 1,
|
15
|
+
:etype => :DICT,
|
16
|
+
:payload => nil)
|
17
|
+
end
|
18
|
+
end
|
17
19
|
end
|
18
20
|
end
|
data/wordcut/pointer.rb
CHANGED
@@ -1,29 +1,32 @@
|
|
1
|
-
|
2
|
-
attr_reader :s, :l, :r, :offset, :dict, :final
|
3
|
-
def initialize(s, l, r, offset, dict, final=false)
|
4
|
-
@s = s
|
5
|
-
@l = l
|
6
|
-
@r = r
|
7
|
-
@offset = offset
|
8
|
-
@dict = dict
|
9
|
-
@final = final
|
10
|
-
end
|
1
|
+
module Wordcut
|
11
2
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
3
|
+
class Pointer
|
4
|
+
attr_reader :s, :l, :r, :offset, :dict, :final
|
5
|
+
def initialize(s, l, r, offset, dict, final=false)
|
6
|
+
@s = s
|
7
|
+
@l = l
|
8
|
+
@r = r
|
9
|
+
@offset = offset
|
10
|
+
@dict = dict
|
11
|
+
@final = final
|
12
|
+
end
|
20
13
|
|
21
|
-
|
22
|
-
|
23
|
-
|
14
|
+
def update(ch)
|
15
|
+
l = @dict.seek(ch, @l, @r, @offset, :LEFT)
|
16
|
+
return nil unless l
|
17
|
+
r = @dict.seek(ch, l, @r, @offset, :RIGHT)
|
18
|
+
final = (@dict[l].headword.length == @offset + 1)
|
19
|
+
self.class.new(@s, l, r, @offset + 1, @dict, final)
|
20
|
+
end
|
24
21
|
end
|
25
|
-
|
26
|
-
|
27
|
-
|
22
|
+
|
23
|
+
module PointersManipulator
|
24
|
+
def new_pointer(i, dict)
|
25
|
+
Pointer.new(i-1, dict.l, dict.r, 0, dict)
|
26
|
+
end
|
27
|
+
|
28
|
+
def transit(pointers, ch)
|
29
|
+
pointers.map{|p| p.update(ch)}.reject(&:nil?)
|
30
|
+
end
|
28
31
|
end
|
29
32
|
end
|
data/wordcut/space_slicer.rb
CHANGED
@@ -1,26 +1,30 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
@s = s
|
6
|
-
@offset = 0
|
7
|
-
@final = false
|
8
|
-
end
|
9
|
-
|
10
|
-
def transit(ch, next_ch)
|
11
|
-
current_is_space = (ch =~ /\s/)
|
12
|
-
next_is_space = (not nil? and next_ch =~ /\s/)
|
1
|
+
module Wordcut
|
2
|
+
|
3
|
+
class SpaceSlicer
|
4
|
+
attr_reader :s, :offset, :final
|
13
5
|
|
14
|
-
|
15
|
-
@
|
16
|
-
elsif current_is_space and not next_is_space
|
17
|
-
@offset += 1
|
18
|
-
@final = true
|
19
|
-
elsif not current_is_space
|
20
|
-
@final = false
|
21
|
-
@s += @offset
|
22
|
-
@s += 1
|
6
|
+
def initialize(s)
|
7
|
+
@s = s
|
23
8
|
@offset = 0
|
9
|
+
@final = false
|
10
|
+
end
|
11
|
+
|
12
|
+
def transit(ch, next_ch)
|
13
|
+
current_is_space = (ch =~ /\s/)
|
14
|
+
next_is_space = (not nil? and next_ch =~ /\s/)
|
15
|
+
|
16
|
+
if current_is_space and next_is_space
|
17
|
+
@offset += 1
|
18
|
+
elsif current_is_space and not next_is_space
|
19
|
+
@offset += 1
|
20
|
+
@final = true
|
21
|
+
elsif not current_is_space
|
22
|
+
@final = false
|
23
|
+
@s += @offset
|
24
|
+
@s += 1
|
25
|
+
@offset = 0
|
26
|
+
end
|
24
27
|
end
|
25
28
|
end
|
29
|
+
|
26
30
|
end
|
data/wordcut/tokenizer.rb
CHANGED
@@ -1,15 +1,17 @@
|
|
1
1
|
require_relative "dag.rb"
|
2
2
|
|
3
|
-
module
|
4
|
-
|
5
|
-
|
3
|
+
module Wordcut
|
4
|
+
module Tokenizer
|
5
|
+
def tokenize(txt)
|
6
|
+
@dag_class.build(@dict, txt).tokens(txt)
|
7
|
+
end
|
6
8
|
end
|
7
|
-
end
|
8
9
|
|
9
|
-
class BasicTokenizer
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
class BasicTokenizer
|
11
|
+
include Tokenizer
|
12
|
+
def initialize(dict)
|
13
|
+
@dict = dict
|
14
|
+
@dag_class = BasicDag
|
15
|
+
end
|
14
16
|
end
|
15
17
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wordcut
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Vee Satayamas
|
@@ -17,7 +17,23 @@ executables: []
|
|
17
17
|
extensions: []
|
18
18
|
extra_rdoc_files: []
|
19
19
|
files:
|
20
|
+
- LICENSE
|
20
21
|
- README.md
|
22
|
+
- data/tha/tdict-acronyms.txt
|
23
|
+
- data/tha/tdict-city.txt
|
24
|
+
- data/tha/tdict-collection.txt
|
25
|
+
- data/tha/tdict-common.txt
|
26
|
+
- data/tha/tdict-country.txt
|
27
|
+
- data/tha/tdict-district.txt
|
28
|
+
- data/tha/tdict-geo.txt
|
29
|
+
- data/tha/tdict-history.txt
|
30
|
+
- data/tha/tdict-ict.txt
|
31
|
+
- data/tha/tdict-lang-ethnic.txt
|
32
|
+
- data/tha/tdict-proper.txt
|
33
|
+
- data/tha/tdict-science.txt
|
34
|
+
- data/tha/tdict-spell.txt
|
35
|
+
- data/tha/tdict-std-compound.txt
|
36
|
+
- data/tha/tdict-std.txt
|
21
37
|
- wordcut/dag.rb
|
22
38
|
- wordcut/dict.rb
|
23
39
|
- wordcut/dict_seek.rb
|
@@ -26,14 +42,14 @@ files:
|
|
26
42
|
- wordcut/pointer.rb
|
27
43
|
- wordcut/space_slicer.rb
|
28
44
|
- wordcut/tokenizer.rb
|
29
|
-
homepage: https://github.com/veer66/wordcut
|
45
|
+
homepage: https://github.com/veer66/wordcut.rb
|
30
46
|
licenses:
|
31
47
|
- LGPL-3.0
|
32
48
|
metadata: {}
|
33
49
|
post_install_message:
|
34
50
|
rdoc_options: []
|
35
51
|
require_paths:
|
36
|
-
-
|
52
|
+
- "."
|
37
53
|
required_ruby_version: !ruby/object:Gem::Requirement
|
38
54
|
requirements:
|
39
55
|
- - ">="
|