wordcut 0.0.1 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/wordcut/dag.rb CHANGED
@@ -2,98 +2,102 @@ require_relative "edge_builder"
2
2
  require_relative "pointer"
3
3
  require_relative "space_slicer"
4
4
 
5
- module DictDagUpdater
6
- def update_by_dict(i, pointers)
7
- edge = self.build_edges(pointers).min
8
- self[i] = edge
9
- return i
5
+
6
+ module Wordcut
7
+ module DictDagUpdater
8
+ def update_by_dict(i, pointers)
9
+ edge = self.build_edges(pointers).min
10
+ self[i] = edge
11
+ return i
12
+ end
10
13
  end
11
- end
12
14
 
13
- module UnkDagUpdater
14
- def update_by_unk(i, left)
15
- src = self[left]
16
- edge = Edge.new(:s => left,
17
- :unk => src.unk + 1,
18
- :chunk => src.chunk + 1,
19
- :etype => :UNK,
20
- :payload => nil)
21
- self[i] = edge
22
- return left
15
+ module UnkDagUpdater
16
+ def update_by_unk(i, left)
17
+ src = self[left]
18
+ edge = Edge.new(:s => left,
19
+ :unk => src.unk + 1,
20
+ :chunk => src.chunk + 1,
21
+ :etype => :UNK,
22
+ :payload => nil)
23
+ self[i] = edge
24
+ return left
25
+ end
23
26
  end
24
- end
25
27
 
26
- module SpaceDagUpdater
27
- def update_by_space(i, slicer)
28
- s = slicer.s
29
- src = self[s]
30
- edge = Edge.new(:s => s,
31
- :unk => src.unk,
32
- :chunk => src.chunk + 1,
33
- :etype => :SPACE,
34
- :payload => nil)
35
- self[i] = edge
36
- return i
28
+ module SpaceDagUpdater
29
+ def update_by_space(i, slicer)
30
+ s = slicer.s
31
+ src = self[s]
32
+ edge = Edge.new(:s => s,
33
+ :unk => src.unk,
34
+ :chunk => src.chunk + 1,
35
+ :etype => :SPACE,
36
+ :payload => nil)
37
+ self[i] = edge
38
+ return i
39
+ end
37
40
  end
38
- end
39
41
 
40
- module BasicDagUpdater
41
- include DictDagUpdater
42
- include UnkDagUpdater
43
- include SpaceDagUpdater
44
- include PointersManipulator
45
-
46
- def update(i, left, pointers, space_slicer)
47
- if not pointers&.empty?
48
- update_by_dict(i, pointers)
49
- elsif space_slicer&.final
50
- update_by_space(i, space_slicer)
51
- else
52
- update_by_unk(i, left)
42
+ module BasicDagUpdater
43
+ include DictDagUpdater
44
+ include UnkDagUpdater
45
+ include SpaceDagUpdater
46
+ include PointersManipulator
47
+
48
+ def update(i, left, pointers, space_slicer)
49
+ if not pointers&.empty?
50
+ update_by_dict(i, pointers)
51
+ elsif space_slicer&.final
52
+ update_by_space(i, space_slicer)
53
+ else
54
+ update_by_unk(i, left)
55
+ end
53
56
  end
54
57
  end
55
- end
56
58
 
57
- module DagBuilder
58
- def build(dict, txt)
59
- self[0] = init_edge
60
- pointers = []
61
- left = 0
62
- space_slicer = SpaceSlicer.new(0)
63
- for i in 1..txt.length
64
- ch = txt[i - 1]
65
- next_ch = i < txt.length ? txt[i] : nil
66
- space_slicer.transit(ch, next_ch)
67
- pointers << new_pointer(i, dict)
68
- pointers = transit(pointers, ch)
69
- left = update(i, left, pointers.select(&:final), space_slicer)
59
+ module DagBuilder
60
+ def build(dict, txt)
61
+ self[0] = init_edge
62
+ pointers = []
63
+ left = 0
64
+ space_slicer = SpaceSlicer.new(0)
65
+ for i in 1..txt.length
66
+ ch = txt[i - 1]
67
+ next_ch = i < txt.length ? txt[i] : nil
68
+ space_slicer.transit(ch, next_ch)
69
+ pointers << new_pointer(i, dict)
70
+ pointers = transit(pointers, ch)
71
+ left = update(i, left, pointers.select(&:final), space_slicer)
72
+ end
70
73
  end
71
74
  end
72
- end
73
75
 
74
- module DagToToken
75
- def tokens(txt)
76
- toks = []
77
- i = txt.length
78
- while i > 0
79
- s = self[i].s
80
- tok = txt.slice(s, i-s)
81
- toks << tok
82
- i = s
76
+ module DagToToken
77
+ def tokens(txt)
78
+ toks = []
79
+ i = txt.length
80
+ while i > 0
81
+ s = self[i].s
82
+ tok = txt.slice(s, i-s)
83
+ toks << tok
84
+ i = s
85
+ end
86
+ toks.reverse
83
87
  end
84
- toks.reverse
85
88
  end
86
- end
87
89
 
88
- class BasicDag < Array
89
- include EdgeBuilder
90
- include BasicDagUpdater
91
- include DagBuilder
92
- include DagToToken
93
-
94
- def self.build(dict, txt)
95
- dag = BasicDag.new(txt.length + 1)
96
- dag.build(dict, txt)
97
- return dag
90
+ class BasicDag < Array
91
+ include EdgeBuilder
92
+ include BasicDagUpdater
93
+ include DagBuilder
94
+ include DagToToken
95
+
96
+ def self.build(dict, txt)
97
+ dag = BasicDag.new(txt.length + 1)
98
+ dag.build(dict, txt)
99
+ return dag
100
+ end
98
101
  end
102
+
99
103
  end
data/wordcut/dict.rb CHANGED
@@ -1,51 +1,52 @@
1
1
  require_relative "dict_seek"
2
-
3
- class WordItem
4
- attr_reader :headword
5
- def initialize(headword)
6
- @headword = headword
7
- end
8
- end
9
-
10
- module DictInfo
11
- def l
12
- 0
2
+ module Wordcut
3
+ class WordItem
4
+ attr_reader :headword
5
+ def initialize(headword)
6
+ @headword = headword
7
+ end
13
8
  end
14
9
 
15
- def r
16
- return nil if self.empty?
17
- self.length - 1
18
- end
19
- end
10
+ module DictInfo
11
+ def l
12
+ 0
13
+ end
20
14
 
21
- module PathResolver
22
- def resolve_path(lang, name)
23
- File.expand_path(File.join(__FILE__, '..', '..', 'data', lang, name, ))
15
+ def r
16
+ return nil if self.empty?
17
+ self.length - 1
18
+ end
24
19
  end
25
- end
26
20
 
27
- module BasicDictLoader
28
- include PathResolver
29
- def load_bundle(lang, name)
30
- load(resolve_path(lang, name))
21
+ module PathResolver
22
+ def resolve_path(lang, name)
23
+ File.expand_path(File.join(__FILE__, '..', '..', 'data', lang, name, ))
24
+ end
31
25
  end
32
26
 
33
- def load(path)
34
- self.concat(open(path).each_line
35
- .map(&:strip)
36
- .reject(&:empty?)
37
- .map{|w| WordItem.new w})
27
+ module BasicDictLoader
28
+ include PathResolver
29
+ def load_bundle(lang, name)
30
+ load(resolve_path(lang, name))
31
+ end
32
+
33
+ def load(path)
34
+ self.concat(open(path).each_line
35
+ .map(&:strip)
36
+ .reject(&:empty?)
37
+ .map{|w| WordItem.new w})
38
+ end
38
39
  end
39
- end
40
40
 
41
- class BasicDict < Array
42
- include DictInfo
43
- include DictSeeker
44
- include BasicDictLoader
41
+ class BasicDict < Array
42
+ include DictInfo
43
+ include DictSeeker
44
+ include BasicDictLoader
45
45
 
46
- def self.from_bundle(lang, name)
47
- dict = self.new
48
- dict.load_bundle(lang, name)
49
- return dict
46
+ def self.from_bundle(lang, name)
47
+ dict = self.new
48
+ dict.load_bundle(lang, name)
49
+ return dict
50
+ end
50
51
  end
51
52
  end
data/wordcut/dict_seek.rb CHANGED
@@ -1,28 +1,30 @@
1
- module DictSeeker
2
- def seek(ch, l, r, offset, policy)
3
- idx = nil
4
- while l <= r
5
- m = (l + r) / 2
6
- w = self[m].headword
7
- wlen = w.length
1
+ module Wordcut
2
+ module DictSeeker
3
+ def seek(ch, l, r, offset, policy)
4
+ idx = nil
5
+ while l <= r
6
+ m = (l + r) / 2
7
+ w = self[m].headword
8
+ wlen = w.length
8
9
 
9
- if wlen <= offset
10
- l = m + 1
11
- else
12
- ch_w = w[offset]
13
- if ch_w < ch
10
+ if wlen <= offset
14
11
  l = m + 1
15
- elsif ch_w > ch
16
- r = m - 1
17
- elsif policy == :LEFT
18
- idx = m
19
- r = m - 1
20
- elsif policy == :RIGHT
21
- idx = m
22
- l = m + 1
23
- end
12
+ else
13
+ ch_w = w[offset]
14
+ if ch_w < ch
15
+ l = m + 1
16
+ elsif ch_w > ch
17
+ r = m - 1
18
+ elsif policy == :LEFT
19
+ idx = m
20
+ r = m - 1
21
+ elsif policy == :RIGHT
22
+ idx = m
23
+ l = m + 1
24
+ end
25
+ end
24
26
  end
27
+ return idx
25
28
  end
26
- return idx
27
29
  end
28
30
  end
data/wordcut/edge.rb CHANGED
@@ -1,21 +1,25 @@
1
- class Edge
2
- attr_reader :unk, :chunk, :s, :payload, :etype
3
-
4
- CMP_FUNCS = [lambda {|e| e.unk}, lambda {|e| e.chunk}]
5
-
6
- def initialize(args = {})
7
- @unk = args[:unk] || 0
8
- @chunk = args[:chunk] || 0
9
- @s = args[:s] || 0
10
- @payload = args[:payload]
11
- @etype = args[:etype]
12
- end
1
+ module Wordcut
2
+
3
+ class Edge
4
+ attr_reader :unk, :chunk, :s, :payload, :etype
5
+
6
+ CMP_FUNCS = [lambda {|e| e.unk}, lambda {|e| e.chunk}]
7
+
8
+ def initialize(args = {})
9
+ @unk = args[:unk] || 0
10
+ @chunk = args[:chunk] || 0
11
+ @s = args[:s] || 0
12
+ @payload = args[:payload]
13
+ @etype = args[:etype]
14
+ end
13
15
 
14
- def <=>(o)
15
- for fn in CMP_FUNCS
16
- cmp = fn.call(self) <=> fn.call(o)
17
- return cmp if cmp != 0
16
+ def <=>(o)
17
+ for fn in CMP_FUNCS
18
+ cmp = fn.call(self) <=> fn.call(o)
19
+ return cmp if cmp != 0
20
+ end
21
+ return 0
18
22
  end
19
- return 0
20
23
  end
24
+
21
25
  end
@@ -1,18 +1,20 @@
1
1
  require_relative "edge.rb"
2
2
 
3
- module EdgeBuilder
4
- def init_edge
5
- Edge.new
6
- end
7
-
8
- def build_edges(pointers)
9
- pointers.map do |pointer|
10
- src = self[pointer.s]
11
- Edge.new(:s => pointer.s,
12
- :unk => src.unk,
13
- :chunk => src.chunk + 1,
14
- :etype => :DICT,
15
- :payload => nil)
16
- end
3
+ module Wordcut
4
+ module EdgeBuilder
5
+ def init_edge
6
+ Edge.new
7
+ end
8
+
9
+ def build_edges(pointers)
10
+ pointers.map do |pointer|
11
+ src = self[pointer.s]
12
+ Edge.new(:s => pointer.s,
13
+ :unk => src.unk,
14
+ :chunk => src.chunk + 1,
15
+ :etype => :DICT,
16
+ :payload => nil)
17
+ end
18
+ end
17
19
  end
18
20
  end
data/wordcut/pointer.rb CHANGED
@@ -1,29 +1,32 @@
1
- class Pointer
2
- attr_reader :s, :l, :r, :offset, :dict, :final
3
- def initialize(s, l, r, offset, dict, final=false)
4
- @s = s
5
- @l = l
6
- @r = r
7
- @offset = offset
8
- @dict = dict
9
- @final = final
10
- end
1
+ module Wordcut
11
2
 
12
- def update(ch)
13
- l = @dict.seek(ch, @l, @r, @offset, :LEFT)
14
- return nil unless l
15
- r = @dict.seek(ch, l, @r, @offset, :RIGHT)
16
- final = (@dict[l].headword.length == @offset + 1)
17
- self.class.new(@s, l, r, @offset + 1, @dict, final)
18
- end
19
- end
3
+ class Pointer
4
+ attr_reader :s, :l, :r, :offset, :dict, :final
5
+ def initialize(s, l, r, offset, dict, final=false)
6
+ @s = s
7
+ @l = l
8
+ @r = r
9
+ @offset = offset
10
+ @dict = dict
11
+ @final = final
12
+ end
20
13
 
21
- module PointersManipulator
22
- def new_pointer(i, dict)
23
- Pointer.new(i-1, dict.l, dict.r, 0, dict)
14
+ def update(ch)
15
+ l = @dict.seek(ch, @l, @r, @offset, :LEFT)
16
+ return nil unless l
17
+ r = @dict.seek(ch, l, @r, @offset, :RIGHT)
18
+ final = (@dict[l].headword.length == @offset + 1)
19
+ self.class.new(@s, l, r, @offset + 1, @dict, final)
20
+ end
24
21
  end
25
-
26
- def transit(pointers, ch)
27
- pointers.map{|p| p.update(ch)}.reject(&:nil?)
22
+
23
+ module PointersManipulator
24
+ def new_pointer(i, dict)
25
+ Pointer.new(i-1, dict.l, dict.r, 0, dict)
26
+ end
27
+
28
+ def transit(pointers, ch)
29
+ pointers.map{|p| p.update(ch)}.reject(&:nil?)
30
+ end
28
31
  end
29
32
  end
@@ -1,26 +1,30 @@
1
- class SpaceSlicer
2
- attr_reader :s, :offset, :final
3
-
4
- def initialize(s)
5
- @s = s
6
- @offset = 0
7
- @final = false
8
- end
9
-
10
- def transit(ch, next_ch)
11
- current_is_space = (ch =~ /\s/)
12
- next_is_space = (not nil? and next_ch =~ /\s/)
1
+ module Wordcut
2
+
3
+ class SpaceSlicer
4
+ attr_reader :s, :offset, :final
13
5
 
14
- if current_is_space and next_is_space
15
- @offset += 1
16
- elsif current_is_space and not next_is_space
17
- @offset += 1
18
- @final = true
19
- elsif not current_is_space
20
- @final = false
21
- @s += @offset
22
- @s += 1
6
+ def initialize(s)
7
+ @s = s
23
8
  @offset = 0
9
+ @final = false
10
+ end
11
+
12
+ def transit(ch, next_ch)
13
+ current_is_space = (ch =~ /\s/)
14
+ next_is_space = (not nil? and next_ch =~ /\s/)
15
+
16
+ if current_is_space and next_is_space
17
+ @offset += 1
18
+ elsif current_is_space and not next_is_space
19
+ @offset += 1
20
+ @final = true
21
+ elsif not current_is_space
22
+ @final = false
23
+ @s += @offset
24
+ @s += 1
25
+ @offset = 0
26
+ end
24
27
  end
25
28
  end
29
+
26
30
  end
data/wordcut/tokenizer.rb CHANGED
@@ -1,15 +1,17 @@
1
1
  require_relative "dag.rb"
2
2
 
3
- module Tokenizer
4
- def tokenize(txt)
5
- @dag_class.build(@dict, txt).tokens(txt)
3
+ module Wordcut
4
+ module Tokenizer
5
+ def tokenize(txt)
6
+ @dag_class.build(@dict, txt).tokens(txt)
7
+ end
6
8
  end
7
- end
8
9
 
9
- class BasicTokenizer
10
- include Tokenizer
11
- def initialize(dict)
12
- @dict = dict
13
- @dag_class = BasicDag
10
+ class BasicTokenizer
11
+ include Tokenizer
12
+ def initialize(dict)
13
+ @dict = dict
14
+ @dag_class = BasicDag
15
+ end
14
16
  end
15
17
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wordcut
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vee Satayamas
@@ -17,7 +17,23 @@ executables: []
17
17
  extensions: []
18
18
  extra_rdoc_files: []
19
19
  files:
20
+ - LICENSE
20
21
  - README.md
22
+ - data/tha/tdict-acronyms.txt
23
+ - data/tha/tdict-city.txt
24
+ - data/tha/tdict-collection.txt
25
+ - data/tha/tdict-common.txt
26
+ - data/tha/tdict-country.txt
27
+ - data/tha/tdict-district.txt
28
+ - data/tha/tdict-geo.txt
29
+ - data/tha/tdict-history.txt
30
+ - data/tha/tdict-ict.txt
31
+ - data/tha/tdict-lang-ethnic.txt
32
+ - data/tha/tdict-proper.txt
33
+ - data/tha/tdict-science.txt
34
+ - data/tha/tdict-spell.txt
35
+ - data/tha/tdict-std-compound.txt
36
+ - data/tha/tdict-std.txt
21
37
  - wordcut/dag.rb
22
38
  - wordcut/dict.rb
23
39
  - wordcut/dict_seek.rb
@@ -26,14 +42,14 @@ files:
26
42
  - wordcut/pointer.rb
27
43
  - wordcut/space_slicer.rb
28
44
  - wordcut/tokenizer.rb
29
- homepage: https://github.com/veer66/wordcut
45
+ homepage: https://github.com/veer66/wordcut.rb
30
46
  licenses:
31
47
  - LGPL-3.0
32
48
  metadata: {}
33
49
  post_install_message:
34
50
  rdoc_options: []
35
51
  require_paths:
36
- - wordcut
52
+ - "."
37
53
  required_ruby_version: !ruby/object:Gem::Requirement
38
54
  requirements:
39
55
  - - ">="