wordcut 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a8fea8a44ae91d8e478ab2333b1a6972e744fe1d
4
+ data.tar.gz: bd19ea76b4594f0d889014405cf7255f9b3b3d35
5
+ SHA512:
6
+ metadata.gz: e712460bc2a4b2030281518eeb8e623eea60520ce4ad9ea80d9e222d0c0c1705bf1ac45224e673f5c07b3859cd3a84ef646a8e20a689e058e4bb6cb0d0dd9926
7
+ data.tar.gz: 6023f766a7c2e8808daa4c1fe646e9d81b10a0f82694ef3edaf2a14236ecbb85d2b41e0d4ab6324fe38d4733e8c38064733d5fb1b2fe6f478f679c55bc366797
data/README.md ADDED
@@ -0,0 +1,6 @@
1
+ # wordcut.rb
2
+ ASEAN word tokenizer written in Ruby.
3
+
4
+ ## Status: pre-alpha
5
+
6
+ ## The API is subject to change.
data/wordcut/dag.rb ADDED
@@ -0,0 +1,99 @@
1
+ require_relative "edge_builder"
2
+ require_relative "pointer"
3
+ require_relative "space_slicer"
4
+
5
# Mixin for an Array-like DAG: fills a slot from dictionary pointers.
module DictDagUpdater
  # Stores in slot +i+ the smallest edge (by +<=>+) among the edges that
  # +build_edges+ produces for +pointers+.
  #
  # @param i [Integer] slot to fill
  # @param pointers [Array] pointers handed to +build_edges+
  # @return [Integer] +i+
  def update_by_dict(i, pointers)
    self[i] = build_edges(pointers).min
    i
  end
end
12
+
13
# Mixin for an Array-like DAG: fills a slot with an unknown-word edge.
module UnkDagUpdater
  # Stores in slot +i+ an :UNK edge starting at +left+. The new edge's unk
  # and chunk counters are those of the edge at +left+, each plus one.
  #
  # @param i [Integer] slot to fill
  # @param left [Integer] index of the source slot; also the edge start
  # @return [Integer] +left+ (unchanged)
  def update_by_unk(i, left)
    origin = self[left]
    self[i] = Edge.new(s: left,
                       unk: origin.unk + 1,
                       chunk: origin.chunk + 1,
                       etype: :UNK,
                       payload: nil)
    left
  end
end
25
+
26
# Mixin for an Array-like DAG: fills a slot with a whitespace edge.
module SpaceDagUpdater
  # Stores in slot +i+ a :SPACE edge starting at +slicer.s+. The unk counter
  # is copied from the edge at that start; the chunk counter is that edge's
  # chunk plus one.
  #
  # @param i [Integer] slot to fill
  # @param slicer [#s] object whose +s+ is the start index of the edge
  # @return [Integer] +i+
  def update_by_space(i, slicer)
    start = slicer.s
    origin = self[start]
    self[i] = Edge.new(s: start,
                       unk: origin.unk,
                       chunk: origin.chunk + 1,
                       etype: :SPACE,
                       payload: nil)
    i
  end
end
39
+
40
# Combines the dictionary, whitespace and unknown-word updaters behind a
# single +update+ entry point.
module BasicDagUpdater
  include DictDagUpdater
  include UnkDagUpdater
  include SpaceDagUpdater
  include PointersManipulator

  # Fills slot +i+ using the first strategy that applies:
  # 1. dictionary edges, when +pointers+ is a non-empty collection;
  # 2. a whitespace edge, when +space_slicer.final+ is truthy;
  # 3. otherwise an unknown-word edge starting at +left+.
  #
  # A nil +pointers+ is treated like an empty list. The previous
  # `not pointers&.empty?` check evaluated to true for nil and sent nil
  # into +update_by_dict+, which then failed inside +build_edges+.
  #
  # @param i [Integer] slot to fill
  # @param left [Integer] start index used by the unknown-word fallback
  # @param pointers [Array, nil] pointers to build dictionary edges from
  # @param space_slicer [#final, #s, nil] whitespace tracker
  # @return [Integer] +i+ for the dictionary and whitespace cases, +left+
  #   for the unknown-word case
  def update(i, left, pointers, space_slicer)
    if pointers && !pointers.empty?
      update_by_dict(i, pointers)
    elsif space_slicer&.final
      update_by_space(i, space_slicer)
    else
      update_by_unk(i, left)
    end
  end
end
56
+
57
# Mixin that fills an Array-like DAG by scanning a text once, left to right.
module DagBuilder
  # Seeds slot 0 with +init_edge+, then for every position +i+ from 1 to
  # +txt.length+:
  # - feeds the current and following character to a SpaceSlicer,
  # - appends a fresh pointer and advances all pointers by the current
  #   character via +transit+,
  # - calls +update+ with the pointers whose +final+ is truthy and keeps
  #   its return value as the next +left+.
  #
  # @param dict [Object] dictionary passed through to +new_pointer+
  # @param txt [String] text to scan
  def build(dict, txt)
    self[0] = init_edge
    left = 0
    pointers = []
    space_slicer = SpaceSlicer.new(0)
    size = txt.length

    (1..size).each do |i|
      ch = txt[i - 1]
      # Stays nil at the last position: there is no following character.
      next_ch = txt[i] if i < size
      space_slicer.transit(ch, next_ch)

      pointers = transit(pointers << new_pointer(i, dict), ch)
      finals = pointers.select(&:final)
      left = update(i, left, finals, space_slicer)
    end
  end
end
73
+
74
# Mixin that reads tokens back out of a filled Array-like DAG.
module DagToToken
  # Walks from slot +txt.length+ back to slot 0, following each edge's +s+
  # (start index), and cuts +txt+ at those boundaries.
  #
  # @param txt [String] the text the DAG was built for
  # @return [Array<String>] tokens in left-to-right order
  def tokens(txt)
    pieces = []
    pos = txt.length
    until pos <= 0
      start = self[pos].s
      pieces << txt.slice(start, pos - start)
      pos = start
    end
    # Pieces were collected right-to-left.
    pieces.reverse
  end
end
87
+
88
# Array-backed DAG with one slot per character boundary of a text.
class BasicDag < Array
  include EdgeBuilder
  include BasicDagUpdater
  include DagBuilder
  include DagToToken

  # Allocates a DAG with +txt.length + 1+ slots and fills it from +txt+.
  #
  # @param dict [Object] dictionary passed to the instance-level +build+
  # @param txt [String] text to segment
  # @return [BasicDag] the populated DAG
  def self.build(dict, txt)
    BasicDag.new(txt.length + 1).tap { |dag| dag.build(dict, txt) }
  end
end
data/wordcut/dict.rb ADDED
@@ -0,0 +1,51 @@
1
+ require_relative "dict_seek"
2
+
3
# A single dictionary entry.
class WordItem
  # @return [Object] the headword given at construction
  attr_reader :headword

  # @param headword [String] surface form of the entry
  def initialize(headword)
    @headword = headword
  end
end
9
+
10
# Mixin giving an Array-like dictionary its leftmost and rightmost indices.
module DictInfo
  # @return [Integer] always 0
  def l
    0
  end

  # @return [Integer, nil] index of the last entry, or nil when empty
  def r
    empty? ? nil : length - 1
  end
end
20
+
21
# Builds paths to dictionary files relative to this source file.
module PathResolver
  # Resolves +name+ under +data/<lang>/+, two path segments above this
  # source file.
  #
  # @param lang [String] language sub-directory
  # @param name [String] dictionary file name
  # @return [String] absolute path
  def resolve_path(lang, name)
    File.expand_path(File.join(__FILE__, '..', '..', 'data', lang, name))
  end
end

# Mixin that fills an Array-like dictionary from a word-list file.
module BasicDictLoader
  include PathResolver

  # Loads the dictionary file located by +resolve_path+.
  #
  # @param lang [String] language sub-directory
  # @param name [String] dictionary file name
  # @return [self]
  def load_bundle(lang, name)
    load(resolve_path(lang, name))
  end

  # Appends one WordItem per non-blank line of +path+ to the receiver.
  # Each line is stripped of surrounding whitespace first.
  #
  # The file is read with File.foreach, which closes the handle once the
  # last line has been read. The earlier +open(path)+ call never closed it,
  # and Kernel#open also treats a path beginning with "|" as a command to
  # run.
  #
  # @param path [String] path of the word list
  # @return [self]
  def load(path)
    words = File.foreach(path).map(&:strip).reject(&:empty?)
    concat(words.map { |word| WordItem.new(word) })
  end
end
40
+
41
# Array-backed dictionary of word items.
class BasicDict < Array
  include DictInfo
  include DictSeeker
  include BasicDictLoader

  # Creates a dictionary and fills it through +load_bundle+.
  #
  # @param lang [String] language sub-directory
  # @param name [String] dictionary file name
  # @return [BasicDict] the loaded dictionary (an instance of the receiver)
  def self.from_bundle(lang, name)
    new.tap { |dict| dict.load_bundle(lang, name) }
  end
end
@@ -0,0 +1,28 @@
1
# Binary search over an Array-like dictionary whose items respond to
# +headword+.
module DictSeeker
  # Searches indices +l+..+r+ for an entry whose headword has +ch+ at
  # position +offset+. Entries whose headword is not longer than +offset+
  # are treated as sorting before any match.
  #
  # NOTE(review): the search presumably relies on the entries in +l+..+r+
  # being sorted and sharing the same first +offset+ characters; nothing in
  # this method checks that.
  #
  # @param ch [String] character to match
  # @param l [Integer, nil] left bound (inclusive)
  # @param r [Integer, nil] right bound (inclusive)
  # @param offset [Integer] character position within the headword
  # @param policy [Symbol] :LEFT for the lowest matching index, :RIGHT for
  #   the highest
  # @return [Integer, nil] matching index, or nil when nothing matches or a
  #   bound is nil (as for an empty dictionary)
  # @raise [ArgumentError] if a match is found under an unknown +policy+;
  #   this used to loop forever because neither bound moved
  def seek(ch, l, r, offset, policy)
    # An empty dictionary reports a nil right bound; `l <= r` would raise.
    return nil if l.nil? || r.nil?

    idx = nil
    while l <= r
      m = (l + r) / 2
      w = self[m].headword

      if w.length <= offset
        l = m + 1
      else
        ch_w = w[offset]
        if ch_w < ch
          l = m + 1
        elsif ch_w > ch
          r = m - 1
        else
          case policy
          when :LEFT
            # Remember the hit and keep looking further left.
            idx = m
            r = m - 1
          when :RIGHT
            # Remember the hit and keep looking further right.
            idx = m
            l = m + 1
          else
            raise ArgumentError, "unknown seek policy: #{policy.inspect}"
          end
        end
      end
    end
    idx
  end
end
data/wordcut/edge.rb ADDED
@@ -0,0 +1,21 @@
1
# One edge of the segmentation DAG.
class Edge
  attr_reader :unk, :chunk, :s, :payload, :etype

  # Key extractors applied in order by +<=>+: unk first, then chunk.
  CMP_FUNCS = [lambda { |e| e.unk }, lambda { |e| e.chunk }]

  # @param args [Hash] optional fields:
  #   :unk, :chunk and :s default to 0 when missing or nil;
  #   :payload and :etype default to nil
  def initialize(args = {})
    @s = args[:s] || 0
    @unk = args[:unk] || 0
    @chunk = args[:chunk] || 0
    @etype = args[:etype]
    @payload = args[:payload]
  end

  # Orders edges by the CMP_FUNCS keys, returning the first comparison
  # result that is not 0.
  #
  # @param o [Edge] edge to compare with
  # @return [Integer, nil] 0 when every key compares equal
  def <=>(o)
    CMP_FUNCS.each do |key|
      order = key.call(self) <=> key.call(o)
      return order unless order == 0
    end
    0
  end
end
@@ -0,0 +1,18 @@
1
+ require_relative "edge.rb"
2
+
3
# Mixin for an Array-like DAG: creates edges.
module EdgeBuilder
  # @return [Edge] an edge built with no arguments, used to seed the DAG
  def init_edge
    Edge.new
  end

  # Builds one :DICT edge per pointer. Each edge starts at the pointer's
  # +s+, copies the unk counter of the edge stored there, and takes that
  # edge's chunk counter plus one.
  #
  # @param pointers [Array<#s>] pointers to build edges from
  # @return [Array<Edge>] one edge per pointer, in the same order
  def build_edges(pointers)
    pointers.map do |pointer|
      start = pointer.s
      origin = self[start]
      Edge.new(s: start,
               unk: origin.unk,
               chunk: origin.chunk + 1,
               etype: :DICT,
               payload: nil)
    end
  end
end
@@ -0,0 +1,29 @@
1
# A cursor into a dictionary: a text start position plus the index range
# +l+..+r+ of entries still matching after +offset+ characters.
class Pointer
  attr_reader :s, :l, :r, :offset, :dict, :final

  # @param s [Integer] start position in the text
  # @param l [Integer] left dictionary index
  # @param r [Integer] right dictionary index
  # @param offset [Integer] number of characters consumed so far
  # @param dict [#seek, #[]] dictionary being walked
  # @param final [Boolean] set by +update+ when the leftmost matching
  #   headword is exactly as long as the consumed prefix
  def initialize(s, l, r, offset, dict, final=false)
    @s, @l, @r = s, l, r
    @offset = offset
    @dict = dict
    @final = final
  end

  # Consumes +ch+ and narrows the dictionary range accordingly.
  #
  # @param ch [String] next character of the text
  # @return [Pointer, nil] a new pointer one character further, or nil when
  #   no entry in the current range has +ch+ at the current offset
  def update(ch)
    lo = @dict.seek(ch, @l, @r, @offset, :LEFT)
    return nil unless lo

    hi = @dict.seek(ch, lo, @r, @offset, :RIGHT)
    consumed = @offset + 1
    complete = (@dict[lo].headword.length == consumed)
    self.class.new(@s, lo, hi, consumed, @dict, complete)
  end
end
20
+
21
# Mixin with helpers for creating and advancing dictionary pointers.
module PointersManipulator
  # Creates a pointer starting at text position +i - 1+ that spans the
  # dictionary's +l+..+r+ range with no characters consumed.
  #
  # @param i [Integer] 1-based position of the current character
  # @param dict [#l, #r] dictionary to walk
  # @return [Pointer]
  def new_pointer(i, dict)
    start = i - 1
    Pointer.new(start, dict.l, dict.r, 0, dict)
  end

  # Advances every pointer by +ch+ and drops those whose +update+ is nil.
  #
  # @param pointers [Array<#update>] pointers to advance
  # @param ch [String] character just read
  # @return [Array] the advanced pointers, in the original order
  def transit(pointers, ch)
    pointers.each_with_object([]) do |pointer, alive|
      moved = pointer.update(ch)
      alive << moved unless moved.nil?
    end
  end
end
@@ -0,0 +1,26 @@
1
# Tracks runs of whitespace while a text is scanned character by character.
#
# +s+ is the start position of the current run, +offset+ the number of
# whitespace characters seen in it, and +final+ is true right after the
# last character of a run has been consumed.
class SpaceSlicer
  attr_reader :s, :offset, :final

  # @param s [Integer] initial start position
  def initialize(s)
    @s = s
    @offset = 0
    @final = false
  end

  # Consumes one character.
  #
  # - Whitespace extends the current run; the run becomes final when the
  #   following character is missing or is not whitespace.
  # - Any other character moves +s+ past the finished run and past the
  #   character itself, then resets +offset+ and +final+.
  #
  # The look-ahead used to be written as `not nil? and next_ch =~ /\s/`,
  # which tested the slicer itself for nil rather than +next_ch+. It now
  # tests +next_ch+; results are the same because `nil =~ /\s/` is nil.
  #
  # @param ch [String] current character
  # @param next_ch [String, nil] following character, nil at end of text
  def transit(ch, next_ch)
    if space?(ch)
      @offset += 1
      @final = true unless space?(next_ch)
    else
      @final = false
      @s += @offset + 1
      @offset = 0
    end
  end

  private

  # True-ish when +ch+ is present and matches whitespace.
  def space?(ch)
    !ch.nil? && ch =~ /\s/
  end
end
@@ -0,0 +1,15 @@
1
+ require_relative "dag.rb"
2
+
3
# Mixin for tokenizers. The including class must set +@dag_class+ and
# +@dict+.
module Tokenizer
  # Builds a DAG for +txt+ with +@dag_class+ and reads its tokens.
  #
  # @param txt [String] text to segment
  # @return [Object] whatever the DAG's +tokens+ returns for +txt+
  def tokenize(txt)
    dag = @dag_class.build(@dict, txt)
    dag.tokens(txt)
  end
end
8
+
9
# Tokenizer that segments text with BasicDag and a caller-supplied
# dictionary.
class BasicTokenizer
  include Tokenizer

  # @param dict [Object] dictionary handed to the DAG builder on each
  #   +tokenize+ call
  def initialize(dict)
    @dag_class = BasicDag
    @dict = dict
  end
end
metadata ADDED
@@ -0,0 +1,53 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wordcut
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Vee Satayamas
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-05-03 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Word segmentation tools for ASEAN languages written in Ruby
14
+ email:
15
+ - v.satayamas@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - README.md
21
+ - wordcut/dag.rb
22
+ - wordcut/dict.rb
23
+ - wordcut/dict_seek.rb
24
+ - wordcut/edge.rb
25
+ - wordcut/edge_builder.rb
26
+ - wordcut/pointer.rb
27
+ - wordcut/space_slicer.rb
28
+ - wordcut/tokenizer.rb
29
+ homepage: https://github.com/veer66/wordcut
30
+ licenses:
31
+ - LGPL-3.0
32
+ metadata: {}
33
+ post_install_message:
34
+ rdoc_options: []
35
+ require_paths:
36
+ - wordcut
37
+ required_ruby_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: 2.3.0
42
+ required_rubygems_version: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ requirements: []
48
+ rubyforge_project:
49
+ rubygems_version: 2.5.1
50
+ signing_key:
51
+ specification_version: 4
52
+ summary: Word segmentation tools for ASEAN languages
53
+ test_files: []