wordcut 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +6 -0
- data/wordcut/dag.rb +99 -0
- data/wordcut/dict.rb +51 -0
- data/wordcut/dict_seek.rb +28 -0
- data/wordcut/edge.rb +21 -0
- data/wordcut/edge_builder.rb +18 -0
- data/wordcut/pointer.rb +29 -0
- data/wordcut/space_slicer.rb +26 -0
- data/wordcut/tokenizer.rb +15 -0
- metadata +53 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a8fea8a44ae91d8e478ab2333b1a6972e744fe1d
|
4
|
+
data.tar.gz: bd19ea76b4594f0d889014405cf7255f9b3b3d35
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e712460bc2a4b2030281518eeb8e623eea60520ce4ad9ea80d9e222d0c0c1705bf1ac45224e673f5c07b3859cd3a84ef646a8e20a689e058e4bb6cb0d0dd9926
|
7
|
+
data.tar.gz: 6023f766a7c2e8808daa4c1fe646e9d81b10a0f82694ef3edaf2a14236ecbb85d2b41e0d4ab6324fe38d4733e8c38064733d5fb1b2fe6f478f679c55bc366797
|
data/README.md
ADDED
data/wordcut/dag.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
require_relative "edge_builder"
|
2
|
+
require_relative "pointer"
|
3
|
+
require_relative "space_slicer"
|
4
|
+
|
5
|
+
# Mixin for an Array-like DAG: fills one slot from dictionary matches.
# The including object must respond to +build_edges+ and +[]=+.
module DictDagUpdater
  # Stores at index +i+ the minimum (per +<=>+) of the edges that
  # +build_edges+ creates for +pointers+.
  #
  # @param i [Integer] slot to fill
  # @param pointers [Array] handed to +build_edges+ unchanged
  # @return [Integer] +i+
  def update_by_dict(i, pointers)
    self[i] = build_edges(pointers).min
    i
  end
end
|
12
|
+
|
13
|
+
# Mixin for an Array-like DAG: fills one slot with an :UNK edge.
module UnkDagUpdater
  # Writes at index +i+ an :UNK edge that starts at +left+. Its +unk+ and
  # +chunk+ counters are each one more than those of the edge stored at
  # +left+.
  #
  # @param i [Integer] slot to fill
  # @param left [Integer] index of the edge this one extends
  # @return [Integer] +left+
  def update_by_unk(i, left)
    origin = self[left]
    self[i] = Edge.new(s: left,
                       unk: origin.unk + 1,
                       chunk: origin.chunk + 1,
                       etype: :UNK,
                       payload: nil)
    left
  end
end
|
25
|
+
|
26
|
+
# Mixin for an Array-like DAG: fills one slot with a :SPACE edge.
module SpaceDagUpdater
  # Writes at index +i+ a :SPACE edge that starts at +slicer.s+. The +unk+
  # counter is copied from the edge stored at that start index and +chunk+
  # is incremented by one.
  #
  # @param i [Integer] slot to fill
  # @param slicer [#s] object exposing the start index of the space run
  # @return [Integer] +i+
  def update_by_space(i, slicer)
    start = slicer.s
    origin = self[start]
    self[i] = Edge.new(s: start,
                       unk: origin.unk,
                       chunk: origin.chunk + 1,
                       etype: :SPACE,
                       payload: nil)
    i
  end
end
|
39
|
+
|
40
|
+
# Combines the dictionary, space and unknown-word updaters and picks which
# one fills a given DAG slot.
module BasicDagUpdater
  include DictDagUpdater
  include UnkDagUpdater
  include SpaceDagUpdater
  include PointersManipulator

  # Fills slot +i+ using the first applicable strategy:
  # 1. dictionary edges, when +pointers+ is a non-empty list;
  # 2. a space edge, when +space_slicer+ reports +final+;
  # 3. an unknown-word edge otherwise.
  #
  # @param i [Integer] slot to fill
  # @param left [Integer] start index used for the unknown-word edge
  # @param pointers [Array, nil] candidate dictionary pointers
  # @param space_slicer [#final, #s, nil] whitespace-run tracker
  # @return [Integer] whatever the chosen updater returns
  def update(i, left, pointers, space_slicer)
    # An explicit presence check: `not pointers&.empty?` was true for nil,
    # which sent a nil list down the dictionary branch.
    if pointers && !pointers.empty?
      update_by_dict(i, pointers)
    elsif space_slicer&.final
      update_by_space(i, space_slicer)
    else
      update_by_unk(i, left)
    end
  end
end
|
56
|
+
|
57
|
+
# Mixin that fills an Array-like DAG by scanning a text once, left to right.
# The including object must provide +init_edge+, +new_pointer+, +transit+
# and +update+.
module DagBuilder
  # Seeds slot 0 with +init_edge+, then for every position 1..txt.length:
  # advances the space slicer, adds a fresh pointer, advances all pointers
  # by the current character, and calls +update+ with the pointers flagged
  # +final+.
  #
  # @param dict [Object] dictionary handed to +new_pointer+
  # @param txt [String] text to scan
  # @return [Range] the iterated range (1..txt.length)
  def build(dict, txt)
    self[0] = init_edge
    active = []
    left = 0
    slicer = SpaceSlicer.new(0)

    (1..txt.length).each do |pos|
      current = txt[pos - 1]
      upcoming = pos < txt.length ? txt[pos] : nil

      slicer.transit(current, upcoming)
      active << new_pointer(pos, dict)
      active = transit(active, current)
      left = update(pos, left, active.select(&:final), slicer)
    end
  end
end
|
73
|
+
|
74
|
+
# Mixin that reads tokens back out of a filled Array-like DAG.
module DagToToken
  # Walks backwards from slot txt.length, following each edge's +s+ (start
  # index) until index 0 is reached, and cuts +txt+ at those boundaries.
  #
  # @param txt [String] the text the DAG was built from
  # @return [Array<String>] tokens in left-to-right order
  def tokens(txt)
    pieces = []
    stop = txt.length
    until stop <= 0
      start = self[stop].s
      # Prepending keeps the result in reading order despite the backward walk.
      pieces.unshift(txt.slice(start, stop - start))
      stop = start
    end
    pieces
  end
end
|
87
|
+
|
88
|
+
# Array-backed DAG with one slot per character boundary of a text
# (txt.length + 1 slots in total).
class BasicDag < Array
  include EdgeBuilder
  include BasicDagUpdater
  include DagBuilder
  include DagToToken

  # Allocates a DAG sized for +txt+ and fills it via the instance-level
  # +build+.
  #
  # @param dict [Object] dictionary forwarded to the instance +build+
  # @param txt [String] text to segment
  # @return [BasicDag] the populated DAG (an instance of the receiver class)
  def self.build(dict, txt)
    # `new` instead of a hard-coded `BasicDag.new`, so subclasses get
    # instances of themselves.
    dag = new(txt.length + 1)
    dag.build(dict, txt)
    dag
  end
end
|
data/wordcut/dict.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
require_relative "dict_seek"
|
2
|
+
|
3
|
+
# A single dictionary entry, wrapping the word's text.
class WordItem
  # @param headword [String] the entry's text
  def initialize(headword)
    @headword = headword
  end

  # @return [String] the text given at construction
  attr_reader :headword
end
|
9
|
+
|
10
|
+
# Mixin exposing the first and last index of an Array-like dictionary.
module DictInfo
  # @return [Integer] always 0, the leftmost index
  def l
    0
  end

  # @return [Integer, nil] the last index, or nil when the dictionary is empty
  def r
    empty? ? nil : length - 1
  end
end
|
20
|
+
|
21
|
+
# Locates bundled dictionary files relative to this source file.
module PathResolver
  # @param lang [String] language sub-directory under +data+
  # @param name [String] dictionary file name
  # @return [String] absolute path of <two levels above this file>/data/lang/name
  def resolve_path(lang, name)
    File.expand_path(File.join(__FILE__, '..', '..', 'data', lang, name))
  end
end

# Mixin that fills an Array-like dictionary from a word-list file with one
# word per line.
module BasicDictLoader
  include PathResolver

  # Loads the bundled word list identified by +lang+ and +name+.
  #
  # @return [self]
  def load_bundle(lang, name)
    load(resolve_path(lang, name))
  end

  # Appends one WordItem per non-blank line of the file at +path+.
  # Lines are stripped of surrounding whitespace first.
  #
  # @param path [String] word-list file to read
  # @return [self]
  def load(path)
    # File.foreach closes the file once every line has been read; the
    # previous `open(path).each_line` never closed its handle.
    words = File.foreach(path).map(&:strip).reject(&:empty?)
    concat(words.map { |word| WordItem.new(word) })
  end
end
|
40
|
+
|
41
|
+
# Array-backed dictionary combining index bounds, seeking and file loading.
class BasicDict < Array
  include DictInfo
  include DictSeeker
  include BasicDictLoader

  # Creates an empty dictionary and fills it through +load_bundle+.
  #
  # @param lang [String] forwarded to +load_bundle+
  # @param name [String] forwarded to +load_bundle+
  # @return [BasicDict] the loaded dictionary
  def self.from_bundle(lang, name)
    new.tap { |dict| dict.load_bundle(lang, name) }
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# Binary search over an Array-like collection of items responding to
# +headword+, comparing a single character position.
module DictSeeker
  # Searches indices l..r for an entry whose headword has +ch+ at position
  # +offset+. Entries too short to have that position are treated as
  # sorting before +ch+.
  #
  # @param ch [String] character to match
  # @param l [Integer, nil] lower index bound (inclusive)
  # @param r [Integer, nil] upper index bound (inclusive)
  # @param offset [Integer] character position compared in each headword
  # @param policy [Symbol] :LEFT for the lowest matching index, :RIGHT for
  #   the highest
  # @return [Integer, nil] the matching index, or nil when nothing matches
  #   or a bound is nil
  # @raise [ArgumentError] when a match is found and +policy+ is neither
  #   :LEFT nor :RIGHT
  def seek(ch, l, r, offset, policy)
    # A nil bound (e.g. the right bound of an empty dictionary) means there
    # is nothing to search; comparing it would raise.
    return nil if l.nil? || r.nil?

    idx = nil
    while l <= r
      m = (l + r) / 2
      w = self[m].headword

      if w.length <= offset
        l = m + 1
      else
        ch_w = w[offset]
        if ch_w < ch
          l = m + 1
        elsif ch_w > ch
          r = m - 1
        else
          case policy
          when :LEFT
            idx = m
            r = m - 1
          when :RIGHT
            idx = m
            l = m + 1
          else
            # Neither bound would move here, so the loop would never end.
            raise ArgumentError, "unknown seek policy: #{policy.inspect}"
          end
        end
      end
    end
    idx
  end
end
|
data/wordcut/edge.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# One edge of the segmentation DAG. Edges are ordered by +unk+ first and
# by +chunk+ second.
class Edge
  # Key extractors applied in order when comparing two edges.
  CMP_FUNCS = [->(e) { e.unk }, ->(e) { e.chunk }]

  attr_reader :unk, :chunk, :s, :payload, :etype

  # @param args [Hash] optional attributes: :unk, :chunk and :s default to
  #   0 when missing or nil; :payload and :etype default to nil
  def initialize(args = {})
    @unk = args[:unk] || 0
    @chunk = args[:chunk] || 0
    @s = args[:s] || 0
    @payload, @etype = args[:payload], args[:etype]
  end

  # Compares key by key and returns the first non-zero result.
  #
  # @param o [Edge] edge to compare against
  # @return [Integer] negative, zero or positive
  def <=>(o)
    CMP_FUNCS.each do |key|
      order = key.call(self) <=> key.call(o)
      return order unless order == 0
    end
    0
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require_relative "edge.rb"
|
2
|
+
|
3
|
+
# Mixin that creates Edge objects for an Array-like DAG.
module EdgeBuilder
  # @return [Edge] an edge built with no arguments
  def init_edge
    Edge.new
  end

  # Builds one :DICT edge per pointer. Each edge starts at the pointer's
  # +s+, copies +unk+ from the edge stored at that index, and has that
  # edge's +chunk+ plus one.
  #
  # @param pointers [Array<#s>] pointers to turn into edges
  # @return [Array<Edge>] edges in the same order as +pointers+
  def build_edges(pointers)
    pointers.map do |ptr|
      origin = self[ptr.s]
      Edge.new(s: ptr.s,
               unk: origin.unk,
               chunk: origin.chunk + 1,
               etype: :DICT,
               payload: nil)
    end
  end
end
|
data/wordcut/pointer.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Immutable cursor over a range of dictionary entries that all match the
# characters consumed so far.
class Pointer
  attr_reader :s, :l, :r, :offset, :dict, :final

  # @param s [Integer] start index carried unchanged through updates
  # @param l [Integer] lowest dictionary index still matching
  # @param r [Integer] highest dictionary index still matching
  # @param offset [Integer] number of characters consumed so far
  # @param dict [#seek, #[]] dictionary being searched
  # @param final [Boolean] whether the entry at +l+ is fully consumed
  def initialize(s, l, r, offset, dict, final=false)
    @s, @l, @r = s, l, r
    @offset = offset
    @dict = dict
    @final = final
  end

  # Narrows the range to entries having +ch+ at the current offset.
  #
  # @param ch [String] next character
  # @return [Pointer, nil] a new pointer one character further, or nil
  #   when no entry in the range matches
  def update(ch)
    left = @dict.seek(ch, @l, @r, @offset, :LEFT)
    return nil unless left

    right = @dict.seek(ch, left, @r, @offset, :RIGHT)
    consumed = @offset + 1
    # Final when the leftmost matching headword is exactly as long as the
    # number of characters consumed.
    complete = @dict[left].headword.length == consumed
    self.class.new(@s, left, right, consumed, @dict, complete)
  end
end
|
20
|
+
|
21
|
+
# Helpers for creating and advancing a list of Pointer objects.
module PointersManipulator
  # Creates a pointer that starts at index +i+ - 1, spans the whole
  # dictionary (+dict.l+..+dict.r+) and has consumed no characters.
  #
  # @param i [Integer] 1-based position; the pointer starts at i - 1
  # @param dict [#l, #r] dictionary to search
  # @return [Pointer]
  def new_pointer(i, dict)
    start = i - 1
    Pointer.new(start, dict.l, dict.r, 0, dict)
  end

  # Advances every pointer by +ch+ and discards those whose update
  # returned nil.
  #
  # @param pointers [Array<#update>] pointers to advance
  # @param ch [String] character to consume
  # @return [Array] the surviving updated pointers, in order
  def transit(pointers, ch)
    pointers.map { |ptr| ptr.update(ch) }.compact
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# Tracks runs of whitespace while a text is scanned one character at a
# time. +s+ is the start index of the current run, +offset+ its length so
# far, and +final+ is true right after the last character of a run.
class SpaceSlicer
  attr_reader :s, :offset, :final

  # @param s [Integer] initial start index
  def initialize(s)
    @s = s
    @offset = 0
    @final = false
  end

  # Consumes one character.
  #
  # - whitespace followed by whitespace: the run grows by one;
  # - whitespace followed by anything else (or by the end of the text):
  #   the run grows by one and is marked final;
  # - non-whitespace: the start moves past the finished run and this
  #   character, and the run length resets to 0.
  #
  # @param ch [String] current character
  # @param next_ch [String, nil] following character, nil at end of text
  def transit(ch, next_ch)
    current_is_space = space?(ch)
    # The nil check now targets next_ch; the former `not nil?` tested the
    # slicer itself and was always true.
    next_is_space = space?(next_ch)

    if current_is_space && next_is_space
      @offset += 1
    elsif current_is_space
      @offset += 1
      @final = true
    else
      @final = false
      @s += @offset + 1
      @offset = 0
    end
  end

  private

  # @return [Boolean] true when +ch+ is non-nil and contains whitespace
  def space?(ch)
    !ch.nil? && !(ch =~ /\s/).nil?
  end
end
|
metadata
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wordcut
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Vee Satayamas
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-05-03 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Word segmentation tools for ASEAN languages written in Ruby
|
14
|
+
email:
|
15
|
+
- v.satayamas@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- README.md
|
21
|
+
- wordcut/dag.rb
|
22
|
+
- wordcut/dict.rb
|
23
|
+
- wordcut/dict_seek.rb
|
24
|
+
- wordcut/edge.rb
|
25
|
+
- wordcut/edge_builder.rb
|
26
|
+
- wordcut/pointer.rb
|
27
|
+
- wordcut/space_slicer.rb
|
28
|
+
- wordcut/tokenizer.rb
|
29
|
+
homepage: https://github.com/veer66/wordcut
|
30
|
+
licenses:
|
31
|
+
- LGPL-3.0
|
32
|
+
metadata: {}
|
33
|
+
post_install_message:
|
34
|
+
rdoc_options: []
|
35
|
+
require_paths:
|
36
|
+
- wordcut
|
37
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: 2.3.0
|
42
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
requirements: []
|
48
|
+
rubyforge_project:
|
49
|
+
rubygems_version: 2.5.1
|
50
|
+
signing_key:
|
51
|
+
specification_version: 4
|
52
|
+
summary: Word segmentation tools for ASEAN languages
|
53
|
+
test_files: []
|