wordcut 0.0.1 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/wordcut/dag.rb CHANGED
@@ -2,98 +2,102 @@ require_relative "edge_builder"
2
2
  require_relative "pointer"
3
3
  require_relative "space_slicer"
4
4
 
5
- module DictDagUpdater
6
- def update_by_dict(i, pointers)
7
- edge = self.build_edges(pointers).min
8
- self[i] = edge
9
- return i
5
+
6
+ module Wordcut
7
+ module DictDagUpdater
8
+ def update_by_dict(i, pointers)
9
+ edge = self.build_edges(pointers).min
10
+ self[i] = edge
11
+ return i
12
+ end
10
13
  end
11
- end
12
14
 
13
- module UnkDagUpdater
14
- def update_by_unk(i, left)
15
- src = self[left]
16
- edge = Edge.new(:s => left,
17
- :unk => src.unk + 1,
18
- :chunk => src.chunk + 1,
19
- :etype => :UNK,
20
- :payload => nil)
21
- self[i] = edge
22
- return left
15
+ module UnkDagUpdater
16
+ def update_by_unk(i, left)
17
+ src = self[left]
18
+ edge = Edge.new(:s => left,
19
+ :unk => src.unk + 1,
20
+ :chunk => src.chunk + 1,
21
+ :etype => :UNK,
22
+ :payload => nil)
23
+ self[i] = edge
24
+ return left
25
+ end
23
26
  end
24
- end
25
27
 
26
- module SpaceDagUpdater
27
- def update_by_space(i, slicer)
28
- s = slicer.s
29
- src = self[s]
30
- edge = Edge.new(:s => s,
31
- :unk => src.unk,
32
- :chunk => src.chunk + 1,
33
- :etype => :SPACE,
34
- :payload => nil)
35
- self[i] = edge
36
- return i
28
+ module SpaceDagUpdater
29
+ def update_by_space(i, slicer)
30
+ s = slicer.s
31
+ src = self[s]
32
+ edge = Edge.new(:s => s,
33
+ :unk => src.unk,
34
+ :chunk => src.chunk + 1,
35
+ :etype => :SPACE,
36
+ :payload => nil)
37
+ self[i] = edge
38
+ return i
39
+ end
37
40
  end
38
- end
39
41
 
40
- module BasicDagUpdater
41
- include DictDagUpdater
42
- include UnkDagUpdater
43
- include SpaceDagUpdater
44
- include PointersManipulator
45
-
46
- def update(i, left, pointers, space_slicer)
47
- if not pointers&.empty?
48
- update_by_dict(i, pointers)
49
- elsif space_slicer&.final
50
- update_by_space(i, space_slicer)
51
- else
52
- update_by_unk(i, left)
42
+ module BasicDagUpdater
43
+ include DictDagUpdater
44
+ include UnkDagUpdater
45
+ include SpaceDagUpdater
46
+ include PointersManipulator
47
+
48
+ def update(i, left, pointers, space_slicer)
49
+ if not pointers&.empty?
50
+ update_by_dict(i, pointers)
51
+ elsif space_slicer&.final
52
+ update_by_space(i, space_slicer)
53
+ else
54
+ update_by_unk(i, left)
55
+ end
53
56
  end
54
57
  end
55
- end
56
58
 
57
- module DagBuilder
58
- def build(dict, txt)
59
- self[0] = init_edge
60
- pointers = []
61
- left = 0
62
- space_slicer = SpaceSlicer.new(0)
63
- for i in 1..txt.length
64
- ch = txt[i - 1]
65
- next_ch = i < txt.length ? txt[i] : nil
66
- space_slicer.transit(ch, next_ch)
67
- pointers << new_pointer(i, dict)
68
- pointers = transit(pointers, ch)
69
- left = update(i, left, pointers.select(&:final), space_slicer)
59
+ module DagBuilder
60
+ def build(dict, txt)
61
+ self[0] = init_edge
62
+ pointers = []
63
+ left = 0
64
+ space_slicer = SpaceSlicer.new(0)
65
+ for i in 1..txt.length
66
+ ch = txt[i - 1]
67
+ next_ch = i < txt.length ? txt[i] : nil
68
+ space_slicer.transit(ch, next_ch)
69
+ pointers << new_pointer(i, dict)
70
+ pointers = transit(pointers, ch)
71
+ left = update(i, left, pointers.select(&:final), space_slicer)
72
+ end
70
73
  end
71
74
  end
72
- end
73
75
 
74
- module DagToToken
75
- def tokens(txt)
76
- toks = []
77
- i = txt.length
78
- while i > 0
79
- s = self[i].s
80
- tok = txt.slice(s, i-s)
81
- toks << tok
82
- i = s
76
+ module DagToToken
77
+ def tokens(txt)
78
+ toks = []
79
+ i = txt.length
80
+ while i > 0
81
+ s = self[i].s
82
+ tok = txt.slice(s, i-s)
83
+ toks << tok
84
+ i = s
85
+ end
86
+ toks.reverse
83
87
  end
84
- toks.reverse
85
88
  end
86
- end
87
89
 
88
- class BasicDag < Array
89
- include EdgeBuilder
90
- include BasicDagUpdater
91
- include DagBuilder
92
- include DagToToken
93
-
94
- def self.build(dict, txt)
95
- dag = BasicDag.new(txt.length + 1)
96
- dag.build(dict, txt)
97
- return dag
90
+ class BasicDag < Array
91
+ include EdgeBuilder
92
+ include BasicDagUpdater
93
+ include DagBuilder
94
+ include DagToToken
95
+
96
+ def self.build(dict, txt)
97
+ dag = BasicDag.new(txt.length + 1)
98
+ dag.build(dict, txt)
99
+ return dag
100
+ end
98
101
  end
102
+
99
103
  end
data/wordcut/dict.rb CHANGED
@@ -1,51 +1,52 @@
1
1
  require_relative "dict_seek"
2
-
3
- class WordItem
4
- attr_reader :headword
5
- def initialize(headword)
6
- @headword = headword
7
- end
8
- end
9
-
10
- module DictInfo
11
- def l
12
- 0
2
+ module Wordcut
3
+ class WordItem
4
+ attr_reader :headword
5
+ def initialize(headword)
6
+ @headword = headword
7
+ end
13
8
  end
14
9
 
15
- def r
16
- return nil if self.empty?
17
- self.length - 1
18
- end
19
- end
10
+ module DictInfo
11
+ def l
12
+ 0
13
+ end
20
14
 
21
- module PathResolver
22
- def resolve_path(lang, name)
23
- File.expand_path(File.join(__FILE__, '..', '..', 'data', lang, name, ))
15
+ def r
16
+ return nil if self.empty?
17
+ self.length - 1
18
+ end
24
19
  end
25
- end
26
20
 
27
- module BasicDictLoader
28
- include PathResolver
29
- def load_bundle(lang, name)
30
- load(resolve_path(lang, name))
21
+ module PathResolver
22
+ def resolve_path(lang, name)
23
+ File.expand_path(File.join(__FILE__, '..', '..', 'data', lang, name, ))
24
+ end
31
25
  end
32
26
 
33
- def load(path)
34
- self.concat(open(path).each_line
35
- .map(&:strip)
36
- .reject(&:empty?)
37
- .map{|w| WordItem.new w})
27
+ module BasicDictLoader
28
+ include PathResolver
29
+ def load_bundle(lang, name)
30
+ load(resolve_path(lang, name))
31
+ end
32
+
33
+ def load(path)
34
+ self.concat(open(path).each_line
35
+ .map(&:strip)
36
+ .reject(&:empty?)
37
+ .map{|w| WordItem.new w})
38
+ end
38
39
  end
39
- end
40
40
 
41
- class BasicDict < Array
42
- include DictInfo
43
- include DictSeeker
44
- include BasicDictLoader
41
+ class BasicDict < Array
42
+ include DictInfo
43
+ include DictSeeker
44
+ include BasicDictLoader
45
45
 
46
- def self.from_bundle(lang, name)
47
- dict = self.new
48
- dict.load_bundle(lang, name)
49
- return dict
46
+ def self.from_bundle(lang, name)
47
+ dict = self.new
48
+ dict.load_bundle(lang, name)
49
+ return dict
50
+ end
50
51
  end
51
52
  end
data/wordcut/dict_seek.rb CHANGED
@@ -1,28 +1,30 @@
1
- module DictSeeker
2
- def seek(ch, l, r, offset, policy)
3
- idx = nil
4
- while l <= r
5
- m = (l + r) / 2
6
- w = self[m].headword
7
- wlen = w.length
1
+ module Wordcut
2
+ module DictSeeker
3
+ def seek(ch, l, r, offset, policy)
4
+ idx = nil
5
+ while l <= r
6
+ m = (l + r) / 2
7
+ w = self[m].headword
8
+ wlen = w.length
8
9
 
9
- if wlen <= offset
10
- l = m + 1
11
- else
12
- ch_w = w[offset]
13
- if ch_w < ch
10
+ if wlen <= offset
14
11
  l = m + 1
15
- elsif ch_w > ch
16
- r = m - 1
17
- elsif policy == :LEFT
18
- idx = m
19
- r = m - 1
20
- elsif policy == :RIGHT
21
- idx = m
22
- l = m + 1
23
- end
12
+ else
13
+ ch_w = w[offset]
14
+ if ch_w < ch
15
+ l = m + 1
16
+ elsif ch_w > ch
17
+ r = m - 1
18
+ elsif policy == :LEFT
19
+ idx = m
20
+ r = m - 1
21
+ elsif policy == :RIGHT
22
+ idx = m
23
+ l = m + 1
24
+ end
25
+ end
24
26
  end
27
+ return idx
25
28
  end
26
- return idx
27
29
  end
28
30
  end
data/wordcut/edge.rb CHANGED
@@ -1,21 +1,25 @@
1
- class Edge
2
- attr_reader :unk, :chunk, :s, :payload, :etype
3
-
4
- CMP_FUNCS = [lambda {|e| e.unk}, lambda {|e| e.chunk}]
5
-
6
- def initialize(args = {})
7
- @unk = args[:unk] || 0
8
- @chunk = args[:chunk] || 0
9
- @s = args[:s] || 0
10
- @payload = args[:payload]
11
- @etype = args[:etype]
12
- end
1
+ module Wordcut
2
+
3
+ class Edge
4
+ attr_reader :unk, :chunk, :s, :payload, :etype
5
+
6
+ CMP_FUNCS = [lambda {|e| e.unk}, lambda {|e| e.chunk}]
7
+
8
+ def initialize(args = {})
9
+ @unk = args[:unk] || 0
10
+ @chunk = args[:chunk] || 0
11
+ @s = args[:s] || 0
12
+ @payload = args[:payload]
13
+ @etype = args[:etype]
14
+ end
13
15
 
14
- def <=>(o)
15
- for fn in CMP_FUNCS
16
- cmp = fn.call(self) <=> fn.call(o)
17
- return cmp if cmp != 0
16
+ def <=>(o)
17
+ for fn in CMP_FUNCS
18
+ cmp = fn.call(self) <=> fn.call(o)
19
+ return cmp if cmp != 0
20
+ end
21
+ return 0
18
22
  end
19
- return 0
20
23
  end
24
+
21
25
  end
data/wordcut/edge_builder.rb CHANGED
@@ -1,18 +1,20 @@
1
1
  require_relative "edge.rb"
2
2
 
3
- module EdgeBuilder
4
- def init_edge
5
- Edge.new
6
- end
7
-
8
- def build_edges(pointers)
9
- pointers.map do |pointer|
10
- src = self[pointer.s]
11
- Edge.new(:s => pointer.s,
12
- :unk => src.unk,
13
- :chunk => src.chunk + 1,
14
- :etype => :DICT,
15
- :payload => nil)
16
- end
3
+ module Wordcut
4
+ module EdgeBuilder
5
+ def init_edge
6
+ Edge.new
7
+ end
8
+
9
+ def build_edges(pointers)
10
+ pointers.map do |pointer|
11
+ src = self[pointer.s]
12
+ Edge.new(:s => pointer.s,
13
+ :unk => src.unk,
14
+ :chunk => src.chunk + 1,
15
+ :etype => :DICT,
16
+ :payload => nil)
17
+ end
18
+ end
17
19
  end
18
20
  end
data/wordcut/pointer.rb CHANGED
@@ -1,29 +1,32 @@
1
- class Pointer
2
- attr_reader :s, :l, :r, :offset, :dict, :final
3
- def initialize(s, l, r, offset, dict, final=false)
4
- @s = s
5
- @l = l
6
- @r = r
7
- @offset = offset
8
- @dict = dict
9
- @final = final
10
- end
1
+ module Wordcut
11
2
 
12
- def update(ch)
13
- l = @dict.seek(ch, @l, @r, @offset, :LEFT)
14
- return nil unless l
15
- r = @dict.seek(ch, l, @r, @offset, :RIGHT)
16
- final = (@dict[l].headword.length == @offset + 1)
17
- self.class.new(@s, l, r, @offset + 1, @dict, final)
18
- end
19
- end
3
+ class Pointer
4
+ attr_reader :s, :l, :r, :offset, :dict, :final
5
+ def initialize(s, l, r, offset, dict, final=false)
6
+ @s = s
7
+ @l = l
8
+ @r = r
9
+ @offset = offset
10
+ @dict = dict
11
+ @final = final
12
+ end
20
13
 
21
- module PointersManipulator
22
- def new_pointer(i, dict)
23
- Pointer.new(i-1, dict.l, dict.r, 0, dict)
14
+ def update(ch)
15
+ l = @dict.seek(ch, @l, @r, @offset, :LEFT)
16
+ return nil unless l
17
+ r = @dict.seek(ch, l, @r, @offset, :RIGHT)
18
+ final = (@dict[l].headword.length == @offset + 1)
19
+ self.class.new(@s, l, r, @offset + 1, @dict, final)
20
+ end
24
21
  end
25
-
26
- def transit(pointers, ch)
27
- pointers.map{|p| p.update(ch)}.reject(&:nil?)
22
+
23
+ module PointersManipulator
24
+ def new_pointer(i, dict)
25
+ Pointer.new(i-1, dict.l, dict.r, 0, dict)
26
+ end
27
+
28
+ def transit(pointers, ch)
29
+ pointers.map{|p| p.update(ch)}.reject(&:nil?)
30
+ end
28
31
  end
29
32
  end
data/wordcut/space_slicer.rb CHANGED
@@ -1,26 +1,30 @@
1
- class SpaceSlicer
2
- attr_reader :s, :offset, :final
3
-
4
- def initialize(s)
5
- @s = s
6
- @offset = 0
7
- @final = false
8
- end
9
-
10
- def transit(ch, next_ch)
11
- current_is_space = (ch =~ /\s/)
12
- next_is_space = (not nil? and next_ch =~ /\s/)
1
+ module Wordcut
2
+
3
+ class SpaceSlicer
4
+ attr_reader :s, :offset, :final
13
5
 
14
- if current_is_space and next_is_space
15
- @offset += 1
16
- elsif current_is_space and not next_is_space
17
- @offset += 1
18
- @final = true
19
- elsif not current_is_space
20
- @final = false
21
- @s += @offset
22
- @s += 1
6
+ def initialize(s)
7
+ @s = s
23
8
  @offset = 0
9
+ @final = false
10
+ end
11
+
12
+ def transit(ch, next_ch)
13
+ current_is_space = (ch =~ /\s/)
14
+ next_is_space = (not nil? and next_ch =~ /\s/)
15
+
16
+ if current_is_space and next_is_space
17
+ @offset += 1
18
+ elsif current_is_space and not next_is_space
19
+ @offset += 1
20
+ @final = true
21
+ elsif not current_is_space
22
+ @final = false
23
+ @s += @offset
24
+ @s += 1
25
+ @offset = 0
26
+ end
24
27
  end
25
28
  end
29
+
26
30
  end
data/wordcut/tokenizer.rb CHANGED
@@ -1,15 +1,17 @@
1
1
  require_relative "dag.rb"
2
2
 
3
- module Tokenizer
4
- def tokenize(txt)
5
- @dag_class.build(@dict, txt).tokens(txt)
3
+ module Wordcut
4
+ module Tokenizer
5
+ def tokenize(txt)
6
+ @dag_class.build(@dict, txt).tokens(txt)
7
+ end
6
8
  end
7
- end
8
9
 
9
- class BasicTokenizer
10
- include Tokenizer
11
- def initialize(dict)
12
- @dict = dict
13
- @dag_class = BasicDag
10
+ class BasicTokenizer
11
+ include Tokenizer
12
+ def initialize(dict)
13
+ @dict = dict
14
+ @dag_class = BasicDag
15
+ end
14
16
  end
15
17
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wordcut
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vee Satayamas
@@ -17,7 +17,23 @@ executables: []
17
17
  extensions: []
18
18
  extra_rdoc_files: []
19
19
  files:
20
+ - LICENSE
20
21
  - README.md
22
+ - data/tha/tdict-acronyms.txt
23
+ - data/tha/tdict-city.txt
24
+ - data/tha/tdict-collection.txt
25
+ - data/tha/tdict-common.txt
26
+ - data/tha/tdict-country.txt
27
+ - data/tha/tdict-district.txt
28
+ - data/tha/tdict-geo.txt
29
+ - data/tha/tdict-history.txt
30
+ - data/tha/tdict-ict.txt
31
+ - data/tha/tdict-lang-ethnic.txt
32
+ - data/tha/tdict-proper.txt
33
+ - data/tha/tdict-science.txt
34
+ - data/tha/tdict-spell.txt
35
+ - data/tha/tdict-std-compound.txt
36
+ - data/tha/tdict-std.txt
21
37
  - wordcut/dag.rb
22
38
  - wordcut/dict.rb
23
39
  - wordcut/dict_seek.rb
@@ -26,14 +42,14 @@ files:
26
42
  - wordcut/pointer.rb
27
43
  - wordcut/space_slicer.rb
28
44
  - wordcut/tokenizer.rb
29
- homepage: https://github.com/veer66/wordcut
45
+ homepage: https://github.com/veer66/wordcut.rb
30
46
  licenses:
31
47
  - LGPL-3.0
32
48
  metadata: {}
33
49
  post_install_message:
34
50
  rdoc_options: []
35
51
  require_paths:
36
- - wordcut
52
+ - "."
37
53
  required_ruby_version: !ruby/object:Gem::Requirement
38
54
  requirements:
39
55
  - - ">="