wordcut 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +6 -0
- data/wordcut/dag.rb +99 -0
- data/wordcut/dict.rb +51 -0
- data/wordcut/dict_seek.rb +28 -0
- data/wordcut/edge.rb +21 -0
- data/wordcut/edge_builder.rb +18 -0
- data/wordcut/pointer.rb +29 -0
- data/wordcut/space_slicer.rb +26 -0
- data/wordcut/tokenizer.rb +15 -0
- metadata +53 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a8fea8a44ae91d8e478ab2333b1a6972e744fe1d
|
4
|
+
data.tar.gz: bd19ea76b4594f0d889014405cf7255f9b3b3d35
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e712460bc2a4b2030281518eeb8e623eea60520ce4ad9ea80d9e222d0c0c1705bf1ac45224e673f5c07b3859cd3a84ef646a8e20a689e058e4bb6cb0d0dd9926
|
7
|
+
data.tar.gz: 6023f766a7c2e8808daa4c1fe646e9d81b10a0f82694ef3edaf2a14236ecbb85d2b41e0d4ab6324fe38d4733e8c38064733d5fb1b2fe6f478f679c55bc366797
|
data/README.md
ADDED
data/wordcut/dag.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
require_relative "edge_builder"
|
2
|
+
require_relative "pointer"
|
3
|
+
require_relative "space_slicer"
|
4
|
+
|
5
|
+
# Mixin for an Array-like DAG that records a dictionary-derived edge.
# The host object must provide +build_edges+ and +[]=+.
module DictDagUpdater
  # Builds one candidate edge per pointer and stores the smallest one
  # (as ordered by the edges' own <=>) at position +i+.
  #
  # @param i position in the DAG to fill
  # @param pointers pointers handed to +build_edges+
  # @return +i+, unchanged
  def update_by_dict(i, pointers)
    self[i] = build_edges(pointers).min
    i
  end
end
|
12
|
+
|
13
|
+
# Mixin for an Array-like DAG that records an "unknown word" edge.
module UnkDagUpdater
  # Stores at position +i+ an :UNK edge that starts at +left+. The new edge
  # carries the unk and chunk counters of the edge stored at +left+, each
  # incremented by one.
  #
  # @param i position in the DAG to fill
  # @param left start position of the unknown span
  # @return +left+, unchanged
  def update_by_unk(i, left)
    origin = self[left]
    self[i] = Edge.new(:s => left,
                       :unk => origin.unk + 1,
                       :chunk => origin.chunk + 1,
                       :etype => :UNK,
                       :payload => nil)
    left
  end
end
|
25
|
+
|
26
|
+
# Mixin for an Array-like DAG that records a whitespace edge.
module SpaceDagUpdater
  # Stores at position +i+ a :SPACE edge starting at +slicer.s+. The unk
  # counter is copied from the edge stored at that start position and the
  # chunk counter is incremented by one.
  #
  # @param i position in the DAG to fill
  # @param slicer object exposing the start position through +s+
  # @return +i+, unchanged
  def update_by_space(i, slicer)
    start = slicer.s
    origin = self[start]
    self[i] = Edge.new(:s => start,
                       :unk => origin.unk,
                       :chunk => origin.chunk + 1,
                       :etype => :SPACE,
                       :payload => nil)
    i
  end
end
|
39
|
+
|
40
|
+
# Combines the dictionary, unknown-word and whitespace updaters and picks
# which one to apply at a given position.
module BasicDagUpdater
  include DictDagUpdater
  include UnkDagUpdater
  include SpaceDagUpdater
  include PointersManipulator

  # Fills position +i+ of the DAG using the first applicable strategy:
  # 1. a dictionary edge when +pointers+ is a non-empty collection,
  # 2. a space edge when +space_slicer+ reports +final+,
  # 3. an unknown-word edge otherwise.
  #
  # @param i position in the DAG to fill
  # @param left start position passed to +update_by_unk+
  # @param pointers candidate pointers; nil is treated like an empty list
  # @param space_slicer object responding to +final+ and +s+ (may be nil)
  # @return whatever the chosen updater returns (+i+ for dict/space, +left+ for unk)
  def update(i, left, pointers, space_slicer)
    # `not pointers&.empty?` was true for nil (nil&.empty? => nil), which
    # sent a nil list into update_by_dict; test for presence explicitly.
    if pointers && !pointers.empty?
      update_by_dict(i, pointers)
    elsif space_slicer&.final
      update_by_space(i, space_slicer)
    else
      update_by_unk(i, left)
    end
  end
end
|
56
|
+
|
57
|
+
# Mixin that fills an Array-like DAG by scanning a text left to right.
# The host must provide +init_edge+, +new_pointer+, +transit+ and +update+.
module DagBuilder
  # Populates the DAG for +txt+. Position 0 receives the initial edge; each
  # position 1..txt.length is then filled by +update+ from the pointers that
  # are final after consuming the character ending at that position.
  #
  # @param dict dictionary handed to +new_pointer+
  # @param txt text to scan; indexed with +[]+ and measured with +length+
  def build(dict, txt)
    self[0] = init_edge
    active = []
    left = 0
    space_slicer = SpaceSlicer.new(0)
    size = txt.length

    (1..size).each do |i|
      ch = txt[i - 1]
      next_ch = (txt[i] if i < size)
      space_slicer.transit(ch, next_ch)

      # A new pointer is started at every position before advancing them all.
      active << new_pointer(i, dict)
      active = transit(active, ch)

      left = update(i, left, active.select(&:final), space_slicer)
    end
  end
end
|
73
|
+
|
74
|
+
# Mixin that turns a filled Array-like DAG back into a list of tokens.
module DagToToken
  # Walks the DAG backwards from the end of +txt+, following each edge's
  # start position (+s+), and returns the covered substrings in text order.
  #
  # @param txt the text the DAG was built for
  # @return [Array] tokens from first to last
  def tokens(txt)
    pieces = []
    pos = txt.length
    while pos > 0
      start = self[pos].s
      # Prepending while walking backwards yields left-to-right order.
      pieces.unshift(txt[start, pos - start])
      pos = start
    end
    pieces
  end
end
|
87
|
+
|
88
|
+
# Array-backed DAG with one slot per text position, combining edge
# construction, position updating, building and token extraction.
class BasicDag < Array
  include EdgeBuilder
  include BasicDagUpdater
  include DagBuilder
  include DagToToken

  # Creates a DAG with txt.length + 1 slots and fills it for +txt+.
  #
  # @param dict dictionary passed to the instance-level +build+
  # @param txt text to segment
  # @return [BasicDag] the populated DAG
  def self.build(dict, txt)
    BasicDag.new(txt.length + 1).tap { |dag| dag.build(dict, txt) }
  end
end
|
data/wordcut/dict.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
require_relative "dict_seek"
|
2
|
+
|
3
|
+
# A single dictionary entry; it only wraps the headword it was created with.
class WordItem
  # The headword passed to the constructor.
  attr_reader :headword

  # @param headword the entry's surface form
  def initialize(headword)
    @headword = headword
  end
end
|
9
|
+
|
10
|
+
# Mixin exposing the index bounds of an Array-like dictionary.
module DictInfo
  # @return [Integer] the lowest index, always 0
  def l
    0
  end

  # @return [Integer, nil] the highest index, or nil when the dictionary is empty
  def r
    empty? ? nil : length - 1
  end
end
|
20
|
+
|
21
|
+
# Mixin that locates bundled data files relative to this source file.
module PathResolver
  # Builds the absolute path of <dir of this file>/../data/<lang>/<name>.
  #
  # @param lang sub-directory under data/
  # @param name file name inside that directory
  # @return [String] the expanded absolute path
  def resolve_path(lang, name)
    # Joining onto __FILE__ and stepping up twice: the first ".." drops the
    # file name, the second leaves the directory containing this file.
    relative = File.join(__FILE__, '..', '..', 'data', lang, name)
    File.expand_path(relative)
  end
end
|
26
|
+
|
27
|
+
# Mixin that loads word lists from text files into an Array-like dictionary.
module BasicDictLoader
  include PathResolver

  # Loads the bundled word list data/<lang>/<name>.
  #
  # @param lang sub-directory under data/
  # @param name file name of the word list
  # @return self
  def load_bundle(lang, name)
    load(resolve_path(lang, name))
  end

  # Appends one WordItem per non-blank line of the file at +path+. Lines are
  # stripped of surrounding whitespace first. Note that for including objects
  # this method takes the place of Kernel#load.
  #
  # @param path path of a text file with one word per line
  # @return self
  def load(path)
    # File.foreach closes the file once iteration finishes, and unlike
    # Kernel#open it never interprets a leading "|" as a command to run.
    words = File.foreach(path).map(&:strip).reject(&:empty?)
    concat(words.map { |w| WordItem.new(w) })
  end
end
|
40
|
+
|
41
|
+
# Array-backed dictionary with bound information, binary seeking and
# file loading mixed in.
class BasicDict < Array
  include DictInfo
  include DictSeeker
  include BasicDictLoader

  # Creates a dictionary and fills it from the bundled word list
  # identified by +lang+ and +name+.
  #
  # @return a new, loaded instance of the receiving class
  def self.from_bundle(lang, name)
    new.tap { |dict| dict.load_bundle(lang, name) }
  end
end
|
data/wordcut/dict_seek.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# Mixin providing a binary search over an Array-like collection of entries
# that respond to +headword+.
module DictSeeker
  # Binary-searches the index range l..r for an entry whose headword has
  # +ch+ at position +offset+. Entries whose headword is too short to have
  # that position are treated as sorting before the target.
  # NOTE(review): this presumably relies on the entries in l..r being sorted
  # and sharing their first +offset+ characters - confirm against callers.
  #
  # @param ch character to look for
  # @param l lowest index to consider (nil yields nil)
  # @param r highest index to consider (nil yields nil)
  # @param offset character position inside each headword
  # @param policy :LEFT for the first matching index, :RIGHT for the last
  # @return [Integer, nil] the matching index, or nil when nothing matches
  # @raise [ArgumentError] when a match is found under an unknown policy
  def seek(ch, l, r, offset, policy)
    # An empty dictionary reports a nil upper bound; `l <= r` would raise.
    return nil if l.nil? || r.nil?

    idx = nil
    while l <= r
      m = (l + r) / 2
      w = self[m].headword

      if w.length <= offset
        l = m + 1
      else
        ch_w = w[offset]
        if ch_w < ch
          l = m + 1
        elsif ch_w > ch
          r = m - 1
        elsif policy == :LEFT
          # Remember the hit and keep looking for an earlier one.
          idx = m
          r = m - 1
        elsif policy == :RIGHT
          # Remember the hit and keep looking for a later one.
          idx = m
          l = m + 1
        else
          # Neither bound would move here, so the loop would never end.
          raise ArgumentError, "unknown seek policy: #{policy.inspect}"
        end
      end
    end
    idx
  end
end
|
data/wordcut/edge.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# One edge of the segmentation DAG: where it starts, its type, and the
# counters used to rank competing edges.
class Edge
  attr_reader :unk, :chunk, :s, :payload, :etype

  # Ranking keys, compared in order: first unk, then chunk.
  CMP_FUNCS = [->(e) { e.unk }, ->(e) { e.chunk }]

  # @param args [Hash] optional :unk, :chunk, :s (each defaults to 0),
  #   :payload and :etype (each defaults to nil)
  def initialize(args = {})
    @s = args[:s] || 0
    @unk = args[:unk] || 0
    @chunk = args[:chunk] || 0
    @etype = args[:etype]
    @payload = args[:payload]
  end

  # Orders edges by the keys in CMP_FUNCS; the first key that differs decides.
  #
  # @param o the edge to compare with
  # @return the result of the first non-zero key comparison, or 0 when all
  #   keys compare equal
  def <=>(o)
    CMP_FUNCS.each do |key|
      result = key.call(self) <=> key.call(o)
      return result if result != 0
    end
    0
  end
end
|
data/wordcut/edge_builder.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require_relative "edge.rb"
|
2
|
+
|
3
|
+
# Mixin that creates edges for an Array-like DAG.
module EdgeBuilder
  # @return [Edge] an edge built with no arguments, used for position 0
  def init_edge
    Edge.new
  end

  # Builds one :DICT edge per pointer. Each edge starts at the pointer's
  # start position, copies the unk counter of the edge stored there and
  # increments its chunk counter by one.
  #
  # @param pointers objects exposing their start position through +s+
  # @return [Array<Edge>] edges in the same order as +pointers+
  def build_edges(pointers)
    pointers.map { |ptr|
      start = ptr.s
      origin = self[start]
      Edge.new(:s => start,
               :unk => origin.unk,
               :chunk => origin.chunk + 1,
               :etype => :DICT,
               :payload => nil)
    }
  end
end
|
data/wordcut/pointer.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# A cursor into a dictionary: the index range l..r that is still a candidate
# after +offset+ characters, for a match that began at text position +s+.
class Pointer
  attr_reader :s, :l, :r, :offset, :dict, :final

  # @param s start position of the match in the text
  # @param l lowest candidate index in +dict+
  # @param r highest candidate index in +dict+
  # @param offset number of characters consumed so far
  # @param dict dictionary responding to +seek+ and +[]+
  # @param final whether the consumed characters form a complete headword
  def initialize(s, l, r, offset, dict, final=false)
    @s, @l, @r = s, l, r
    @offset = offset
    @dict = dict
    @final = final
  end

  # Advances by one character without modifying the receiver.
  #
  # @param ch the next character of the text
  # @return a new pointer narrowed to the entries having +ch+ at the current
  #   offset, or nil when no candidate has it
  def update(ch)
    lower = @dict.seek(ch, @l, @r, @offset, :LEFT)
    return nil unless lower

    upper = @dict.seek(ch, lower, @r, @offset, :RIGHT)
    consumed = @offset + 1
    # Only the first remaining candidate is checked for an exact-length match.
    complete = (@dict[lower].headword.length == consumed)
    self.class.new(@s, lower, upper, consumed, @dict, complete)
  end
end
|
20
|
+
|
21
|
+
# Mixin with helpers for creating and advancing dictionary pointers.
module PointersManipulator
  # Creates a pointer for a match starting at text position i - 1 that
  # spans the dictionary's whole index range (dict.l..dict.r) at offset 0.
  #
  # @param i 1-based position of the character about to be consumed
  # @param dict dictionary exposing +l+ and +r+
  # @return [Pointer]
  def new_pointer(i, dict)
    start = i - 1
    Pointer.new(start, dict.l, dict.r, 0, dict)
  end

  # Advances every pointer by +ch+ and drops those that no longer match
  # (their +update+ returned nil).
  #
  # @param pointers objects responding to +update+
  # @param ch the character just consumed
  # @return [Array] the surviving, advanced pointers
  def transit(pointers, ch)
    advanced = pointers.map { |ptr| ptr.update(ch) }
    advanced.compact
  end
end
|
data/wordcut/space_slicer.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# Tracks runs of whitespace while a text is scanned one character at a time.
#
# +s+ is the start position of the current whitespace run, +offset+ the
# number of whitespace characters seen in it so far, and +final+ is true
# right after the last character of a run has been consumed.
class SpaceSlicer
  attr_reader :s, :offset, :final

  # @param s position at which scanning starts
  def initialize(s)
    @s = s
    @offset = 0
    @final = false
  end

  # Consumes one character.
  #
  # @param ch the character being consumed
  # @param next_ch the character after it, or nil at the end of the text
  def transit(ch, next_ch)
    current_is_space = (ch =~ /\s/)
    # The original wrote `not nil?`, which tested the slicer itself rather
    # than next_ch; check the argument so nil is handled explicitly.
    next_is_space = (!next_ch.nil? && next_ch =~ /\s/)

    if current_is_space && next_is_space
      # Inside a whitespace run that continues.
      @offset += 1
    elsif current_is_space
      # Last whitespace character of the run.
      @offset += 1
      @final = true
    else
      # Non-space character: move the start past any finished run and past
      # this character, then reset the run length.
      @final = false
      @s += @offset + 1
      @offset = 0
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wordcut
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Vee Satayamas
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-05-03 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Word segmentation tools for ASEAN languages written in Ruby
|
14
|
+
email:
|
15
|
+
- v.satayamas@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- README.md
|
21
|
+
- wordcut/dag.rb
|
22
|
+
- wordcut/dict.rb
|
23
|
+
- wordcut/dict_seek.rb
|
24
|
+
- wordcut/edge.rb
|
25
|
+
- wordcut/edge_builder.rb
|
26
|
+
- wordcut/pointer.rb
|
27
|
+
- wordcut/space_slicer.rb
|
28
|
+
- wordcut/tokenizer.rb
|
29
|
+
homepage: https://github.com/veer66/wordcut
|
30
|
+
licenses:
|
31
|
+
- LGPL-3.0
|
32
|
+
metadata: {}
|
33
|
+
post_install_message:
|
34
|
+
rdoc_options: []
|
35
|
+
require_paths:
|
36
|
+
- wordcut
|
37
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: 2.3.0
|
42
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
requirements: []
|
48
|
+
rubyforge_project:
|
49
|
+
rubygems_version: 2.5.1
|
50
|
+
signing_key:
|
51
|
+
specification_version: 4
|
52
|
+
summary: Word segmentation tools for ASEAN languages
|
53
|
+
test_files: []
|