wordcut 0.0.1 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +165 -0
- data/data/tha/tdict-acronyms.txt +471 -0
- data/data/tha/tdict-city.txt +242 -0
- data/data/tha/tdict-collection.txt +54 -0
- data/data/tha/tdict-common.txt +1234 -0
- data/data/tha/tdict-country.txt +217 -0
- data/data/tha/tdict-district.txt +114 -0
- data/data/tha/tdict-geo.txt +53 -0
- data/data/tha/tdict-history.txt +35 -0
- data/data/tha/tdict-ict.txt +260 -0
- data/data/tha/tdict-lang-ethnic.txt +43 -0
- data/data/tha/tdict-proper.txt +393 -0
- data/data/tha/tdict-science.txt +146 -0
- data/data/tha/tdict-spell.txt +82 -0
- data/data/tha/tdict-std-compound.txt +5493 -0
- data/data/tha/tdict-std.txt +15374 -0
- data/wordcut/dag.rb +82 -78
- data/wordcut/dict.rb +39 -38
- data/wordcut/dict_seek.rb +24 -22
- data/wordcut/edge.rb +21 -17
- data/wordcut/edge_builder.rb +16 -14
- data/wordcut/pointer.rb +27 -24
- data/wordcut/space_slicer.rb +25 -21
- data/wordcut/tokenizer.rb +11 -9
- metadata +19 -3
data/wordcut/dag.rb
CHANGED
@@ -2,98 +2,102 @@ require_relative "edge_builder"
|
|
2
2
|
require_relative "pointer"
|
3
3
|
require_relative "space_slicer"
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
|
6
|
+
module Wordcut
|
7
|
+
module DictDagUpdater
|
8
|
+
def update_by_dict(i, pointers)
|
9
|
+
edge = self.build_edges(pointers).min
|
10
|
+
self[i] = edge
|
11
|
+
return i
|
12
|
+
end
|
10
13
|
end
|
11
|
-
end
|
12
14
|
|
13
|
-
module UnkDagUpdater
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
15
|
+
module UnkDagUpdater
|
16
|
+
def update_by_unk(i, left)
|
17
|
+
src = self[left]
|
18
|
+
edge = Edge.new(:s => left,
|
19
|
+
:unk => src.unk + 1,
|
20
|
+
:chunk => src.chunk + 1,
|
21
|
+
:etype => :UNK,
|
22
|
+
:payload => nil)
|
23
|
+
self[i] = edge
|
24
|
+
return left
|
25
|
+
end
|
23
26
|
end
|
24
|
-
end
|
25
27
|
|
26
|
-
module SpaceDagUpdater
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
28
|
+
module SpaceDagUpdater
|
29
|
+
def update_by_space(i, slicer)
|
30
|
+
s = slicer.s
|
31
|
+
src = self[s]
|
32
|
+
edge = Edge.new(:s => s,
|
33
|
+
:unk => src.unk,
|
34
|
+
:chunk => src.chunk + 1,
|
35
|
+
:etype => :SPACE,
|
36
|
+
:payload => nil)
|
37
|
+
self[i] = edge
|
38
|
+
return i
|
39
|
+
end
|
37
40
|
end
|
38
|
-
end
|
39
41
|
|
40
|
-
module BasicDagUpdater
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
42
|
+
module BasicDagUpdater
|
43
|
+
include DictDagUpdater
|
44
|
+
include UnkDagUpdater
|
45
|
+
include SpaceDagUpdater
|
46
|
+
include PointersManipulator
|
47
|
+
|
48
|
+
def update(i, left, pointers, space_slicer)
|
49
|
+
if not pointers&.empty?
|
50
|
+
update_by_dict(i, pointers)
|
51
|
+
elsif space_slicer&.final
|
52
|
+
update_by_space(i, space_slicer)
|
53
|
+
else
|
54
|
+
update_by_unk(i, left)
|
55
|
+
end
|
53
56
|
end
|
54
57
|
end
|
55
|
-
end
|
56
58
|
|
57
|
-
module DagBuilder
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
59
|
+
module DagBuilder
|
60
|
+
def build(dict, txt)
|
61
|
+
self[0] = init_edge
|
62
|
+
pointers = []
|
63
|
+
left = 0
|
64
|
+
space_slicer = SpaceSlicer.new(0)
|
65
|
+
for i in 1..txt.length
|
66
|
+
ch = txt[i - 1]
|
67
|
+
next_ch = i < txt.length ? txt[i] : nil
|
68
|
+
space_slicer.transit(ch, next_ch)
|
69
|
+
pointers << new_pointer(i, dict)
|
70
|
+
pointers = transit(pointers, ch)
|
71
|
+
left = update(i, left, pointers.select(&:final), space_slicer)
|
72
|
+
end
|
70
73
|
end
|
71
74
|
end
|
72
|
-
end
|
73
75
|
|
74
|
-
module DagToToken
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
76
|
+
module DagToToken
|
77
|
+
def tokens(txt)
|
78
|
+
toks = []
|
79
|
+
i = txt.length
|
80
|
+
while i > 0
|
81
|
+
s = self[i].s
|
82
|
+
tok = txt.slice(s, i-s)
|
83
|
+
toks << tok
|
84
|
+
i = s
|
85
|
+
end
|
86
|
+
toks.reverse
|
83
87
|
end
|
84
|
-
toks.reverse
|
85
88
|
end
|
86
|
-
end
|
87
89
|
|
88
|
-
class BasicDag < Array
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
90
|
+
class BasicDag < Array
|
91
|
+
include EdgeBuilder
|
92
|
+
include BasicDagUpdater
|
93
|
+
include DagBuilder
|
94
|
+
include DagToToken
|
95
|
+
|
96
|
+
def self.build(dict, txt)
|
97
|
+
dag = BasicDag.new(txt.length + 1)
|
98
|
+
dag.build(dict, txt)
|
99
|
+
return dag
|
100
|
+
end
|
98
101
|
end
|
102
|
+
|
99
103
|
end
|
data/wordcut/dict.rb
CHANGED
@@ -1,51 +1,52 @@
|
|
1
1
|
require_relative "dict_seek"
|
2
|
-
|
3
|
-
class WordItem
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
end
|
9
|
-
|
10
|
-
module DictInfo
|
11
|
-
def l
|
12
|
-
0
|
2
|
+
module Wordcut
|
3
|
+
class WordItem
|
4
|
+
attr_reader :headword
|
5
|
+
def initialize(headword)
|
6
|
+
@headword = headword
|
7
|
+
end
|
13
8
|
end
|
14
9
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
end
|
10
|
+
module DictInfo
|
11
|
+
def l
|
12
|
+
0
|
13
|
+
end
|
20
14
|
|
21
|
-
|
22
|
-
|
23
|
-
|
15
|
+
def r
|
16
|
+
return nil if self.empty?
|
17
|
+
self.length - 1
|
18
|
+
end
|
24
19
|
end
|
25
|
-
end
|
26
20
|
|
27
|
-
module
|
28
|
-
|
29
|
-
|
30
|
-
|
21
|
+
module PathResolver
|
22
|
+
def resolve_path(lang, name)
|
23
|
+
File.expand_path(File.join(__FILE__, '..', '..', 'data', lang, name, ))
|
24
|
+
end
|
31
25
|
end
|
32
26
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
27
|
+
module BasicDictLoader
|
28
|
+
include PathResolver
|
29
|
+
def load_bundle(lang, name)
|
30
|
+
load(resolve_path(lang, name))
|
31
|
+
end
|
32
|
+
|
33
|
+
def load(path)
|
34
|
+
self.concat(open(path).each_line
|
35
|
+
.map(&:strip)
|
36
|
+
.reject(&:empty?)
|
37
|
+
.map{|w| WordItem.new w})
|
38
|
+
end
|
38
39
|
end
|
39
|
-
end
|
40
40
|
|
41
|
-
class BasicDict < Array
|
42
|
-
|
43
|
-
|
44
|
-
|
41
|
+
class BasicDict < Array
|
42
|
+
include DictInfo
|
43
|
+
include DictSeeker
|
44
|
+
include BasicDictLoader
|
45
45
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
46
|
+
def self.from_bundle(lang, name)
|
47
|
+
dict = self.new
|
48
|
+
dict.load_bundle(lang, name)
|
49
|
+
return dict
|
50
|
+
end
|
50
51
|
end
|
51
52
|
end
|
data/wordcut/dict_seek.rb
CHANGED
@@ -1,28 +1,30 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
1
|
+
module Wordcut
|
2
|
+
module DictSeeker
|
3
|
+
def seek(ch, l, r, offset, policy)
|
4
|
+
idx = nil
|
5
|
+
while l <= r
|
6
|
+
m = (l + r) / 2
|
7
|
+
w = self[m].headword
|
8
|
+
wlen = w.length
|
8
9
|
|
9
|
-
|
10
|
-
l = m + 1
|
11
|
-
else
|
12
|
-
ch_w = w[offset]
|
13
|
-
if ch_w < ch
|
10
|
+
if wlen <= offset
|
14
11
|
l = m + 1
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
12
|
+
else
|
13
|
+
ch_w = w[offset]
|
14
|
+
if ch_w < ch
|
15
|
+
l = m + 1
|
16
|
+
elsif ch_w > ch
|
17
|
+
r = m - 1
|
18
|
+
elsif policy == :LEFT
|
19
|
+
idx = m
|
20
|
+
r = m - 1
|
21
|
+
elsif policy == :RIGHT
|
22
|
+
idx = m
|
23
|
+
l = m + 1
|
24
|
+
end
|
25
|
+
end
|
24
26
|
end
|
27
|
+
return idx
|
25
28
|
end
|
26
|
-
return idx
|
27
29
|
end
|
28
30
|
end
|
data/wordcut/edge.rb
CHANGED
@@ -1,21 +1,25 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
1
|
+
module Wordcut
|
2
|
+
|
3
|
+
class Edge
|
4
|
+
attr_reader :unk, :chunk, :s, :payload, :etype
|
5
|
+
|
6
|
+
CMP_FUNCS = [lambda {|e| e.unk}, lambda {|e| e.chunk}]
|
7
|
+
|
8
|
+
def initialize(args = {})
|
9
|
+
@unk = args[:unk] || 0
|
10
|
+
@chunk = args[:chunk] || 0
|
11
|
+
@s = args[:s] || 0
|
12
|
+
@payload = args[:payload]
|
13
|
+
@etype = args[:etype]
|
14
|
+
end
|
13
15
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
16
|
+
def <=>(o)
|
17
|
+
for fn in CMP_FUNCS
|
18
|
+
cmp = fn.call(self) <=> fn.call(o)
|
19
|
+
return cmp if cmp != 0
|
20
|
+
end
|
21
|
+
return 0
|
18
22
|
end
|
19
|
-
return 0
|
20
23
|
end
|
24
|
+
|
21
25
|
end
|
data/wordcut/edge_builder.rb
CHANGED
@@ -1,18 +1,20 @@
|
|
1
1
|
require_relative "edge.rb"
|
2
2
|
|
3
|
-
module
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
pointers
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
3
|
+
module Wordcut
|
4
|
+
module EdgeBuilder
|
5
|
+
def init_edge
|
6
|
+
Edge.new
|
7
|
+
end
|
8
|
+
|
9
|
+
def build_edges(pointers)
|
10
|
+
pointers.map do |pointer|
|
11
|
+
src = self[pointer.s]
|
12
|
+
Edge.new(:s => pointer.s,
|
13
|
+
:unk => src.unk,
|
14
|
+
:chunk => src.chunk + 1,
|
15
|
+
:etype => :DICT,
|
16
|
+
:payload => nil)
|
17
|
+
end
|
18
|
+
end
|
17
19
|
end
|
18
20
|
end
|
data/wordcut/pointer.rb
CHANGED
@@ -1,29 +1,32 @@
|
|
1
|
-
|
2
|
-
attr_reader :s, :l, :r, :offset, :dict, :final
|
3
|
-
def initialize(s, l, r, offset, dict, final=false)
|
4
|
-
@s = s
|
5
|
-
@l = l
|
6
|
-
@r = r
|
7
|
-
@offset = offset
|
8
|
-
@dict = dict
|
9
|
-
@final = final
|
10
|
-
end
|
1
|
+
module Wordcut
|
11
2
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
3
|
+
class Pointer
|
4
|
+
attr_reader :s, :l, :r, :offset, :dict, :final
|
5
|
+
def initialize(s, l, r, offset, dict, final=false)
|
6
|
+
@s = s
|
7
|
+
@l = l
|
8
|
+
@r = r
|
9
|
+
@offset = offset
|
10
|
+
@dict = dict
|
11
|
+
@final = final
|
12
|
+
end
|
20
13
|
|
21
|
-
|
22
|
-
|
23
|
-
|
14
|
+
def update(ch)
|
15
|
+
l = @dict.seek(ch, @l, @r, @offset, :LEFT)
|
16
|
+
return nil unless l
|
17
|
+
r = @dict.seek(ch, l, @r, @offset, :RIGHT)
|
18
|
+
final = (@dict[l].headword.length == @offset + 1)
|
19
|
+
self.class.new(@s, l, r, @offset + 1, @dict, final)
|
20
|
+
end
|
24
21
|
end
|
25
|
-
|
26
|
-
|
27
|
-
|
22
|
+
|
23
|
+
module PointersManipulator
|
24
|
+
def new_pointer(i, dict)
|
25
|
+
Pointer.new(i-1, dict.l, dict.r, 0, dict)
|
26
|
+
end
|
27
|
+
|
28
|
+
def transit(pointers, ch)
|
29
|
+
pointers.map{|p| p.update(ch)}.reject(&:nil?)
|
30
|
+
end
|
28
31
|
end
|
29
32
|
end
|
data/wordcut/space_slicer.rb
CHANGED
@@ -1,26 +1,30 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
@s = s
|
6
|
-
@offset = 0
|
7
|
-
@final = false
|
8
|
-
end
|
9
|
-
|
10
|
-
def transit(ch, next_ch)
|
11
|
-
current_is_space = (ch =~ /\s/)
|
12
|
-
next_is_space = (not nil? and next_ch =~ /\s/)
|
1
|
+
module Wordcut
|
2
|
+
|
3
|
+
class SpaceSlicer
|
4
|
+
attr_reader :s, :offset, :final
|
13
5
|
|
14
|
-
|
15
|
-
@
|
16
|
-
elsif current_is_space and not next_is_space
|
17
|
-
@offset += 1
|
18
|
-
@final = true
|
19
|
-
elsif not current_is_space
|
20
|
-
@final = false
|
21
|
-
@s += @offset
|
22
|
-
@s += 1
|
6
|
+
def initialize(s)
|
7
|
+
@s = s
|
23
8
|
@offset = 0
|
9
|
+
@final = false
|
10
|
+
end
|
11
|
+
|
12
|
+
def transit(ch, next_ch)
|
13
|
+
current_is_space = (ch =~ /\s/)
|
14
|
+
next_is_space = (not nil? and next_ch =~ /\s/)
|
15
|
+
|
16
|
+
if current_is_space and next_is_space
|
17
|
+
@offset += 1
|
18
|
+
elsif current_is_space and not next_is_space
|
19
|
+
@offset += 1
|
20
|
+
@final = true
|
21
|
+
elsif not current_is_space
|
22
|
+
@final = false
|
23
|
+
@s += @offset
|
24
|
+
@s += 1
|
25
|
+
@offset = 0
|
26
|
+
end
|
24
27
|
end
|
25
28
|
end
|
29
|
+
|
26
30
|
end
|
data/wordcut/tokenizer.rb
CHANGED
@@ -1,15 +1,17 @@
|
|
1
1
|
require_relative "dag.rb"
|
2
2
|
|
3
|
-
module
|
4
|
-
|
5
|
-
|
3
|
+
module Wordcut
|
4
|
+
module Tokenizer
|
5
|
+
def tokenize(txt)
|
6
|
+
@dag_class.build(@dict, txt).tokens(txt)
|
7
|
+
end
|
6
8
|
end
|
7
|
-
end
|
8
9
|
|
9
|
-
class BasicTokenizer
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
class BasicTokenizer
|
11
|
+
include Tokenizer
|
12
|
+
def initialize(dict)
|
13
|
+
@dict = dict
|
14
|
+
@dag_class = BasicDag
|
15
|
+
end
|
14
16
|
end
|
15
17
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wordcut
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Vee Satayamas
|
@@ -17,7 +17,23 @@ executables: []
|
|
17
17
|
extensions: []
|
18
18
|
extra_rdoc_files: []
|
19
19
|
files:
|
20
|
+
- LICENSE
|
20
21
|
- README.md
|
22
|
+
- data/tha/tdict-acronyms.txt
|
23
|
+
- data/tha/tdict-city.txt
|
24
|
+
- data/tha/tdict-collection.txt
|
25
|
+
- data/tha/tdict-common.txt
|
26
|
+
- data/tha/tdict-country.txt
|
27
|
+
- data/tha/tdict-district.txt
|
28
|
+
- data/tha/tdict-geo.txt
|
29
|
+
- data/tha/tdict-history.txt
|
30
|
+
- data/tha/tdict-ict.txt
|
31
|
+
- data/tha/tdict-lang-ethnic.txt
|
32
|
+
- data/tha/tdict-proper.txt
|
33
|
+
- data/tha/tdict-science.txt
|
34
|
+
- data/tha/tdict-spell.txt
|
35
|
+
- data/tha/tdict-std-compound.txt
|
36
|
+
- data/tha/tdict-std.txt
|
21
37
|
- wordcut/dag.rb
|
22
38
|
- wordcut/dict.rb
|
23
39
|
- wordcut/dict_seek.rb
|
@@ -26,14 +42,14 @@ files:
|
|
26
42
|
- wordcut/pointer.rb
|
27
43
|
- wordcut/space_slicer.rb
|
28
44
|
- wordcut/tokenizer.rb
|
29
|
-
homepage: https://github.com/veer66/wordcut
|
45
|
+
homepage: https://github.com/veer66/wordcut.rb
|
30
46
|
licenses:
|
31
47
|
- LGPL-3.0
|
32
48
|
metadata: {}
|
33
49
|
post_install_message:
|
34
50
|
rdoc_options: []
|
35
51
|
require_paths:
|
36
|
-
-
|
52
|
+
- "."
|
37
53
|
required_ruby_version: !ruby/object:Gem::Requirement
|
38
54
|
requirements:
|
39
55
|
- - ">="
|