chinese_sugar 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/sugar/trie.rb ADDED
@@ -0,0 +1,110 @@
1
+ require "sugar/version"
2
+
3
module Sugar
  # Dictionary-backed trie used to segment text into words.
  #
  # The trie is a nested Hash keyed by single characters. A node that ends a
  # dictionary word additionally carries the key '' (empty string). Word
  # frequencies are kept in a separate Hash, and +count+ is the sum of the
  # frequencies added while the dictionary was read.
  class Trie
    # FIXME: should be class method
    attr_accessor :trie, :frequnces, :count

    # Builds the trie from dict.txt, resolved relative to this file
    # ("../../dict.txt" from __FILE__).
    def initialize
      dict = File.expand_path("../../dict.txt", __FILE__)
      self.trie, self.frequnces, self.count = Sugar::Trie.build(dict)
    end

    # Returns true when +word+ is a complete dictionary entry, false otherwise.
    def word?(word)
      node = suffix(word)
      !node.nil? && node.key?('')
    end

    # Walks the trie along the characters of +word+.
    #
    # Returns the node (Hash) reached after the last character, or nil as soon
    # as a character has no matching child. An empty +word+ returns the root.
    def suffix(word)
      node = trie
      word.each_char do |char|
        node = node[char]
        return nil if node.nil?
      end
      node
    end

    # Returns the frequency stored for +word+, or nil when it has none.
    def frequnce(word)
      frequnces[word]
    end

    # Constructs the word graph (DAG) of +sentence+.
    #
    # Returns an Array with one entry per character: entry i lists every end
    # index j for which sentence[i..j] is a dictionary word. When no dictionary
    # word starts at i, the entry falls back to [i] so that the character is
    # treated as a one-character word and the graph never has a dead end.
    def DAG(sentence)
      n = sentence.length
      dag = Array.new(n) { [] }
      0.upto(n - 1) do |i|
        sfx = suffix(sentence[i])
        i.upto(n - 1) do |j|
          break if sfx.nil?
          dag[i].push(j) if sfx[''] # '' marks the end of a word
          # sentence[j + 1] is nil past the end, which makes sfx nil as well
          sfx = sfx[sentence[j + 1]]
        end
        dag[i].push(i) if dag[i].empty?
      end
      dag
    end

    # Viterbi algorithm, recursive step.
    #
    # Returns the largest product of word probabilities over all ways of
    # splitting sentence[i..-1] along the edges of +graph+, and records the
    # winning split(s) in +path+: for every best edge i..j the key j + 1 (the
    # start of the next word) maps to the nested Hash describing the rest of
    # the sentence. Edges whose product differs from the best one are removed
    # again, so ties are all kept.
    #
    # Returns 1 once +i+ is past the end of the graph, and -1 when graph[i] has
    # no edges. There is no memoization: a position is evaluated again for
    # every path that reaches it.
    def viterbi_distance(graph, sentence, i, path)
      return 1 unless i < graph.size

      best = -1
      distances = graph[i].map do |j|
        path[j + 1] = {}
        distance = viterbi_distance(graph, sentence, j + 1, path[j + 1]) * possibility(sentence[i..j])
        best = distance if best < distance # keep the highest probability
        distance
      end
      graph[i].zip(distances) { |j, distance| path.delete(j + 1) if distance != best }
      best
    end

    # Relative frequency of +word+ (its frequency divided by +count+).
    # A word without a frequency entry is given frequency 1 instead of raising,
    # so characters missing from the dictionary can still be scored.
    def possibility(word)
      1.0 * (frequnce(word) || 1) / count
    end

    # Segments +sentence+.
    #
    # Returns [probability, path], where +probability+ is the value computed by
    # viterbi_distance from position 0 and +path+ is the nested Hash of word
    # boundaries it filled in (see viterbi_distance).
    def best_segmentation(sentence)
      dag = self.DAG(sentence)
      # Start empty: pre-seeding key 1 left a stale entry whenever the first
      # word was longer than one character.
      path = {}
      best_probability = viterbi_distance(dag, sentence, 0, path)
      [best_probability, path]
    end

    # Loads a dictionary file with one "word frequency ..." entry per line.
    #
    # Returns [trie, frequnces, count]. Blank lines are skipped; a missing
    # frequency column counts as 0 (nil.to_i).
    def self.build(dict)
      trie, frequnces, count = {}, {}, 0
      File.foreach(dict) do |line|
        word, freq, _ = line.split(' ')
        next if word.nil? # blank line

        count += (frequnces[word] = freq.to_i)
        node = trie # reference pointer
        word.each_char do |char|
          node[char] = {} if node[char].nil?
          node = node[char]
        end
        node[''] = '' # label the end of word
      end
      [trie, frequnces, count]
    end

    # Loads a previously dumped object from a cache file.
    # NOTE(review): Marshal.load can instantiate arbitrary objects; only pass
    # cache files that this program wrote itself.
    def self.load(tempfile)
      Marshal.load(tempfile.read)
    end

    # Writes the marshalled instance to +tempfile+.
    def dump(tempfile)
      tempfile.write(Marshal.dump(self))
    end

    # Short summary: up to the first 11 root characters and the total count.
    def inspect
      "#<trie: #{trie.keys[0..10].join(',')}..., count: #{count}>"
    end
    # The method was originally published under this misspelled name; keep it
    # callable for existing code.
    alias_method :insepct, :inspect
  end
end
@@ -0,0 +1,3 @@
1
module Sugar
  # Gem version string. Frozen so the constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
data/lib/sugar.rb ADDED
@@ -0,0 +1,5 @@
1
+ require "sugar/version"
2
+
3
# Top-level namespace of the gem.
module Sugar
  # Load sugar/trie on first reference to Sugar::Trie, so requiring this entry
  # point is enough to reach the segmenter.
  autoload :Trie, "sugar/trie"
end
data/sugar.gemspec ADDED
@@ -0,0 +1,23 @@
1
+ # coding: utf-8
2
# Put this checkout's lib/ on the load path (once) so the version constant can
# be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.include?(lib_dir) || $LOAD_PATH.unshift(lib_dir)
require 'sugar/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name     = "chinese_sugar"
  gem.version  = Sugar::VERSION
  gem.authors  = ["binz"]
  gem.email    = ["xinkiang@gmail.com"]
  gem.homepage = "https://github.com/slacken/sugar"
  gem.license  = "MIT"

  # Descriptions
  gem.summary     = "Chinese text data mining."
  gem.description = "1.Chinese words segmentation using Trie and Viterbi"

  # Packaged files: everything tracked by git; executables and test files are
  # picked out of that list by path prefix.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Development-only dependencies
  gem.add_development_dependency "bundler", "~> 1.5"
  gem.add_development_dependency "rake"
end
data/sugar.rb ADDED
@@ -0,0 +1,14 @@
1
+ $LOAD_PATH.unshift(File.expand_path('../lib', __FILE__))
2
+
3
+ require 'sugar/trie'
4
+ require 'tempfile'
5
+
6
+
7
# Demo: print a sample sentence, its word DAG and its best segmentation.
segmenter = Sugar::Trie.new
sentence = '到底有没有自然灾害自然灾害到了什么程度我不是很清楚但我希望这个回答能终结知乎上对大饥荒死亡人数的疑问'

puts sentence
puts segmenter.DAG(sentence).to_s
puts segmenter.best_segmentation(sentence).to_s
metadata ADDED
@@ -0,0 +1,83 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: chinese_sugar
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - binz
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.5'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.5'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: 1.Chinese words segmentation using Trie and Viterbi
42
+ email:
43
+ - xinkiang@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".gitignore"
49
+ - Gemfile
50
+ - LICENSE.txt
51
+ - README.md
52
+ - Rakefile
53
+ - lib/dict.txt
54
+ - lib/sugar.rb
55
+ - lib/sugar/trie.rb
56
+ - lib/sugar/version.rb
57
+ - sugar.gemspec
58
+ - sugar.rb
59
+ homepage: https://github.com/slacken/sugar
60
+ licenses:
61
+ - MIT
62
+ metadata: {}
63
+ post_install_message:
64
+ rdoc_options: []
65
+ require_paths:
66
+ - lib
67
+ required_ruby_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ requirements: []
78
+ rubyforge_project:
79
+ rubygems_version: 2.2.2
80
+ signing_key:
81
+ specification_version: 4
82
+ summary: Chinese text data mining.
83
+ test_files: []