wordcut 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a8fea8a44ae91d8e478ab2333b1a6972e744fe1d
4
+ data.tar.gz: bd19ea76b4594f0d889014405cf7255f9b3b3d35
5
+ SHA512:
6
+ metadata.gz: e712460bc2a4b2030281518eeb8e623eea60520ce4ad9ea80d9e222d0c0c1705bf1ac45224e673f5c07b3859cd3a84ef646a8e20a689e058e4bb6cb0d0dd9926
7
+ data.tar.gz: 6023f766a7c2e8808daa4c1fe646e9d81b10a0f82694ef3edaf2a14236ecbb85d2b41e0d4ab6324fe38d4733e8c38064733d5fb1b2fe6f478f679c55bc366797
data/README.md ADDED
@@ -0,0 +1,6 @@
1
+ # wordcut.rb
2
+ ASEAN word tokenizer written in Ruby.
3
+
4
+ ## Status: pre-alpha
5
+
6
+ ## The API is subject to change.
data/wordcut/dag.rb ADDED
@@ -0,0 +1,99 @@
1
+ require_relative "edge_builder"
2
+ require_relative "pointer"
3
+ require_relative "space_slicer"
4
+
5
# Mixin that stores the best dictionary-backed edge at a DAG position.
# The including object must provide +build_edges+ and +[]=+.
module DictDagUpdater
  # Builds one candidate edge per pointer, keeps the smallest candidate
  # (as decided by the candidates' own ordering) and writes it at index +i+.
  #
  # i        - index in the receiver to write
  # pointers - collection handed to +build_edges+
  #
  # Returns +i+ unchanged.
  def update_by_dict(i, pointers)
    candidates = build_edges(pointers)
    self[i] = candidates.min
    i
  end
end
12
+
13
# Mixin that records an "unknown word" edge at a DAG position.
module UnkDagUpdater
  # Writes at index +i+ an :UNK edge that starts at +left+. Both the unknown
  # count and the chunk count are one greater than those of the edge stored
  # at +left+.
  #
  # i    - index in the receiver to write
  # left - index of the edge the new edge extends
  #
  # Returns +left+ unchanged.
  def update_by_unk(i, left)
    origin = self[left]
    self[i] = Edge.new(s: left,
                       unk: origin.unk + 1,
                       chunk: origin.chunk + 1,
                       etype: :UNK,
                       payload: nil)
    left
  end
end
25
+
26
# Mixin that records a whitespace edge at a DAG position.
module SpaceDagUpdater
  # Writes at index +i+ a :SPACE edge that starts at +slicer.s+. The unknown
  # count is copied from the edge stored at that start index and the chunk
  # count is incremented by one.
  #
  # i      - index in the receiver to write
  # slicer - object exposing the start index through +s+
  #
  # Returns +i+ unchanged.
  def update_by_space(i, slicer)
    start = slicer.s
    origin = self[start]
    self[i] = Edge.new(s: start,
                       unk: origin.unk,
                       chunk: origin.chunk + 1,
                       etype: :SPACE,
                       payload: nil)
    i
  end
end
39
+
40
# Combines the dictionary, unknown-word and whitespace updaters and picks
# which one to apply for a given position.
module BasicDagUpdater
  include DictDagUpdater
  include UnkDagUpdater
  include SpaceDagUpdater
  include PointersManipulator

  # Writes the edge for position +i+, choosing the strategy in priority order:
  # dictionary match, completed whitespace run, then unknown text.
  #
  # i            - index in the receiver to write
  # left         - start index used when falling back to an unknown edge
  # pointers     - dictionary pointers that are final at +i+ (may be nil/empty)
  # space_slicer - whitespace tracker exposing +final+ and +s+ (may be nil)
  #
  # Returns whatever the chosen updater returns (+i+ for dictionary and space
  # edges, +left+ for unknown edges).
  def update(i, left, pointers, space_slicer)
    # `not pointers&.empty?` evaluated to true for nil, which sent nil into
    # update_by_dict; require an actual non-empty collection instead.
    if pointers && !pointers.empty?
      update_by_dict(i, pointers)
    elsif space_slicer&.final
      update_by_space(i, space_slicer)
    else
      update_by_unk(i, left)
    end
  end
end
56
+
57
# Mixin that fills the receiver with one edge per text position.
# The including object must provide +init_edge+, +new_pointer+, +transit+
# and +update+.
module DagBuilder
  # Walks +txt+ one character at a time. At every position it advances the
  # whitespace tracker, starts a new dictionary pointer, advances all live
  # pointers by the current character and lets +update+ write the edge using
  # only the pointers that are final.
  #
  # dict - dictionary handed to +new_pointer+
  # txt  - text to segment
  #
  # Returns the range that was iterated (1..txt.length).
  def build(dict, txt)
    self[0] = init_edge
    active = []
    left = 0
    slicer = SpaceSlicer.new(0)
    last = txt.length
    (1..last).each do |i|
      ch = txt[i - 1]
      upcoming = i < last ? txt[i] : nil # nil marks the end of the text
      slicer.transit(ch, upcoming)
      active << new_pointer(i, dict)
      active = transit(active, ch)
      left = update(i, left, active.select(&:final), slicer)
    end
  end
end
73
+
74
# Mixin that turns a filled DAG back into the list of text pieces.
module DagToToken
  # Follows the stored start indices backwards from the end of +txt+ and cuts
  # one piece of text per hop.
  #
  # txt - the text the receiver was built from
  #
  # Returns the pieces in reading order.
  def tokens(txt)
    pieces = []
    stop = txt.length
    while stop > 0
      start = self[stop].s
      # Walking right-to-left, so each piece goes in front of the later ones.
      pieces.unshift(txt[start, stop - start])
      stop = start
    end
    pieces
  end
end
87
+
88
# Array-backed DAG with one slot per text position (plus the start slot).
class BasicDag < Array
  include EdgeBuilder
  include BasicDagUpdater
  include DagBuilder
  include DagToToken

  # Allocates a DAG of txt.length + 1 slots and fills it from +dict+ and +txt+.
  #
  # Returns the populated BasicDag.
  def self.build(dict, txt)
    BasicDag.new(txt.length + 1).tap { |dag| dag.build(dict, txt) }
  end
end
data/wordcut/dict.rb ADDED
@@ -0,0 +1,51 @@
1
+ require_relative "dict_seek"
2
+
3
# A single dictionary entry; it carries only its headword.
class WordItem
  # headword - the entry's text
  def initialize(headword)
    @headword = headword
  end

  attr_reader :headword
end
9
+
10
# Mixin exposing the index bounds of an array-like dictionary.
module DictInfo
  # Returns the lowest index, always 0.
  def l
    0
  end

  # Returns the highest valid index, or nil when the receiver is empty.
  def r
    length - 1 unless empty?
  end
end
20
+
21
# Mixin that locates bundled data files relative to this source file.
module PathResolver
  # Builds the absolute path <dir of this file>/../data/<lang>/<name>.
  #
  # lang - sub-directory under data/
  # name - file name inside that sub-directory
  #
  # Returns the absolute path as a String.
  def resolve_path(lang, name)
    relative = File.join('..', '..', 'data', lang, name)
    # The first '..' steps off the file name itself, the second leaves its directory.
    File.expand_path(relative, __FILE__)
  end
end
26
+
27
# Mixin that fills an array-like dictionary with WordItem entries read from
# a word-list file (one headword per line).
module BasicDictLoader
  include PathResolver

  # Loads the bundled word list identified by +lang+ and +name+.
  #
  # Returns the receiver.
  def load_bundle(lang, name)
    load(resolve_path(lang, name))
  end

  # Reads +path+ line by line, strips surrounding whitespace, skips blank
  # lines and appends one WordItem per remaining line to the receiver.
  #
  # NOTE: this shadows Kernel#load on the including object.
  #
  # path - path of the word-list file
  #
  # Returns the receiver. Raises the usual SystemCallError subclasses
  # (e.g. Errno::ENOENT) when the file cannot be opened.
  def load(path)
    # File.open with a block closes the handle even on error; the previous
    # Kernel#open call leaked it and would run a command for a "|..." path.
    words = File.open(path) do |file|
      file.each_line
          .map(&:strip)
          .reject(&:empty?)
          .map { |w| WordItem.new(w) }
    end
    concat(words)
  end
end
40
+
41
# Array-backed dictionary of WordItem entries with lookup and loading mixins.
class BasicDict < Array
  include DictInfo
  include DictSeeker
  include BasicDictLoader

  # Creates a dictionary and fills it from the bundled word list identified
  # by +lang+ and +name+.
  #
  # Returns the new dictionary.
  def self.from_bundle(lang, name)
    new.tap { |dict| dict.load_bundle(lang, name) }
  end
end
@@ -0,0 +1,28 @@
1
# Mixin providing a binary search over an array-like dictionary whose
# entries respond to +headword+.
# NOTE(review): the search presumes entries are sorted by headword - confirm
# against the word-list files.
module DictSeeker
  # Finds, within the index window l..r, an entry whose headword has +ch+ at
  # character position +offset+.
  #
  # ch     - character to match
  # l      - lowest index of the window (inclusive)
  # r      - highest index of the window (inclusive); nil means "no entries"
  # offset - character position inside the headword to compare
  # policy - :LEFT for the lowest matching index, :RIGHT for the highest
  #
  # Returns the matching index, or nil when nothing matches.
  # Raises ArgumentError when a match is found under an unknown policy.
  def seek(ch, l, r, offset, policy)
    # A nil bound (DictInfo#r on an empty dictionary) used to crash on
    # `l <= r`; there is simply nothing to find.
    return nil if l.nil? || r.nil?

    idx = nil
    while l <= r
      m = (l + r) / 2
      w = self[m].headword

      if w.length <= offset
        # Headword too short to have a character at +offset+: look right.
        l = m + 1
      else
        ch_w = w[offset]
        if ch_w < ch
          l = m + 1
        elsif ch_w > ch
          r = m - 1
        elsif policy == :LEFT
          idx = m
          r = m - 1
        elsif policy == :RIGHT
          idx = m
          l = m + 1
        else
          # Neither bound would move here, so the loop would never terminate.
          raise ArgumentError, "unknown seek policy: #{policy.inspect}"
        end
      end
    end
    idx
  end
end
data/wordcut/edge.rb ADDED
@@ -0,0 +1,21 @@
1
# One DAG edge: where it starts (+s+), how many unknown pieces (+unk+) and
# pieces overall (+chunk+) the path up to it contains, plus a type tag and
# an optional payload.
class Edge
  attr_reader :unk, :chunk, :s, :payload, :etype

  # Sort keys, most significant first.
  CMP_FUNCS = [->(e) { e.unk }, ->(e) { e.chunk }]

  # args - Hash with optional keys :unk, :chunk, :s (each defaulting to 0),
  #        :payload and :etype (each defaulting to nil)
  def initialize(args = {})
    @unk = args[:unk] || 0
    @chunk = args[:chunk] || 0
    @s = args[:s] || 0
    @payload = args[:payload]
    @etype = args[:etype]
  end

  # Orders edges by +unk+, then by +chunk+.
  #
  # Returns the first non-zero key comparison, or 0 when every key ties.
  def <=>(o)
    CMP_FUNCS.each do |key|
      order = key.call(self) <=> key.call(o)
      return order unless order == 0
    end
    0
  end
end
@@ -0,0 +1,18 @@
1
+ require_relative "edge.rb"
2
+
3
# Mixin that creates Edge objects for an array-like DAG.
module EdgeBuilder
  # Returns the starting edge: an Edge built with all defaults.
  def init_edge
    Edge.new
  end

  # Builds one :DICT edge per pointer. Each edge starts at the pointer's +s+,
  # copies the unknown count of the edge stored there and adds one chunk.
  #
  # pointers - collection of objects responding to +s+
  #
  # Returns an Array of Edge objects, in pointer order.
  def build_edges(pointers)
    pointers.map do |ptr|
      origin = self[ptr.s]
      Edge.new(s: ptr.s,
               unk: origin.unk,
               chunk: origin.chunk + 1,
               etype: :DICT,
               payload: nil)
    end
  end
end
@@ -0,0 +1,29 @@
1
# Cursor into a dictionary: the window l..r of candidate entries that match
# the +offset+ characters consumed so far, for a word starting at text
# position +s+.
class Pointer
  attr_reader :s, :l, :r, :offset, :dict, :final

  def initialize(s, l, r, offset, dict, final=false)
    @s, @l, @r = s, l, r
    @offset = offset
    @dict = dict
    @final = final
  end

  # Narrows the window to the entries that have +ch+ at the current offset.
  #
  # Returns a new pointer advanced by one character, or nil when no entry in
  # the window matches. The new pointer is final when the headword at the
  # window's left edge is exactly as long as the characters consumed.
  def update(ch)
    left = @dict.seek(ch, @l, @r, @offset, :LEFT)
    return nil unless left

    right = @dict.seek(ch, left, @r, @offset, :RIGHT)
    consumed = @offset + 1
    complete = @dict[left].headword.length == consumed
    self.class.new(@s, left, right, consumed, @dict, complete)
  end
end
20
+
21
# Mixin with helpers for creating and advancing dictionary pointers.
module PointersManipulator
  # Creates a pointer that starts at text position i - 1, spans the whole
  # dictionary (dict.l..dict.r) and has consumed no characters yet.
  def new_pointer(i, dict)
    start = i - 1
    Pointer.new(start, dict.l, dict.r, 0, dict)
  end

  # Advances every pointer by +ch+ and drops those whose update returns nil.
  #
  # Returns a new Array with the surviving (advanced) pointers, in order.
  def transit(pointers, ch)
    pointers.each_with_object([]) do |ptr, alive|
      advanced = ptr.update(ch)
      alive << advanced unless advanced.nil?
    end
  end
end
@@ -0,0 +1,26 @@
1
# Tracks runs of whitespace while text is fed in one character at a time.
#
# s      - text position where the current/next whitespace run starts
# offset - number of whitespace characters seen in the current run
# final  - true right after the last character of a whitespace run
class SpaceSlicer
  attr_reader :s, :offset, :final

  # s - text position at which tracking starts
  def initialize(s)
    @s = s
    @offset = 0
    @final = false
  end

  # Consumes one character.
  #
  # ch      - the current character
  # next_ch - the following character, or nil at the end of the text
  #
  # Whitespace is whatever /\s/ matches.
  def transit(ch, next_ch)
    current_is_space = ch =~ /\s/
    # The original guard was `not nil?`, which asks the slicer itself (always
    # true) instead of next_ch; test the character that may actually be nil.
    next_is_space = !next_ch.nil? && next_ch =~ /\s/

    if current_is_space && next_is_space
      # Inside a run that continues.
      @offset += 1
    elsif current_is_space
      # Last whitespace character of the run.
      @offset += 1
      @final = true
    else
      # Non-space character: move the start past the finished run (if any)
      # and past this character.
      @final = false
      @s += @offset + 1
      @offset = 0
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require_relative "dag.rb"
2
+
3
# Mixin that segments text using the including object's @dag_class and @dict.
module Tokenizer
  # Builds a DAG for +txt+ and reads the tokens back from it.
  #
  # Returns whatever the DAG's +tokens+ method returns for +txt+.
  def tokenize(txt)
    dag = @dag_class.build(@dict, txt)
    dag.tokens(txt)
  end
end
8
+
9
# Tokenizer that segments text with a BasicDag over the given dictionary.
class BasicTokenizer
  include Tokenizer

  # dict - dictionary passed to BasicDag.build on every tokenize call
  def initialize(dict)
    @dict, @dag_class = dict, BasicDag
  end
end
metadata ADDED
@@ -0,0 +1,53 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wordcut
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Vee Satayamas
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-05-03 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Word segmentation tools for ASEAN languages written in Ruby
14
+ email:
15
+ - v.satayamas@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - README.md
21
+ - wordcut/dag.rb
22
+ - wordcut/dict.rb
23
+ - wordcut/dict_seek.rb
24
+ - wordcut/edge.rb
25
+ - wordcut/edge_builder.rb
26
+ - wordcut/pointer.rb
27
+ - wordcut/space_slicer.rb
28
+ - wordcut/tokenizer.rb
29
+ homepage: https://github.com/veer66/wordcut
30
+ licenses:
31
+ - LGPL-3.0
32
+ metadata: {}
33
+ post_install_message:
34
+ rdoc_options: []
35
+ require_paths:
36
+ - wordcut
37
+ required_ruby_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: 2.3.0
42
+ required_rubygems_version: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ requirements: []
48
+ rubyforge_project:
49
+ rubygems_version: 2.5.1
50
+ signing_key:
51
+ specification_version: 4
52
+ summary: Word segmentation tools for ASEAN languages
53
+ test_files: []