harmonious_dictionary 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34) hide show
  1. data/.gitignore +5 -0
  2. data/.rspec +1 -0
  3. data/CHANGELOG +3 -0
  4. data/MIT-LICENSE +20 -0
  5. data/README.markdown +53 -0
  6. data/Rakefile +8 -0
  7. data/benchmark/benchmark.rb +43 -0
  8. data/benchmark/text_test_100.txt +4 -0
  9. data/benchmark/text_test_1000.txt +25 -0
  10. data/benchmark/text_test_10000.txt +219 -0
  11. data/bin/harmonious_rseg +11 -0
  12. data/bin/harmonious_server +63 -0
  13. data/harmonious_dictionary.gemspec +20 -0
  14. data/lib/generators/harmonious_dictionary/setup/setup_generator.rb +16 -0
  15. data/lib/generators/harmonious_dictionary/setup/templates/chinese_dictionary.txt +0 -0
  16. data/lib/generators/harmonious_dictionary/setup/templates/english_dictionary.txt +0 -0
  17. data/lib/generators/harmonious_dictionary/setup/templates/remote_server.yml +8 -0
  18. data/lib/harmonious_dictionary.rb +48 -0
  19. data/lib/harmonious_dictionary/app.rb +18 -0
  20. data/lib/harmonious_dictionary/engines/dict.rb +51 -0
  21. data/lib/harmonious_dictionary/engines/engine.rb +21 -0
  22. data/lib/harmonious_dictionary/engines/english.rb +27 -0
  23. data/lib/harmonious_dictionary/filters/conjunction.rb +11 -0
  24. data/lib/harmonious_dictionary/filters/fullwidth.rb +21 -0
  25. data/lib/harmonious_dictionary/filters/symbol.rb +16 -0
  26. data/lib/harmonious_dictionary/model_additions.rb +15 -0
  27. data/lib/harmonious_dictionary/railtie.rb +23 -0
  28. data/lib/harmonious_dictionary/rseg.rb +170 -0
  29. data/lib/harmonious_dictionary/version.rb +3 -0
  30. data/lib/tasks/generate_dictionary.rake +55 -0
  31. data/spec/harmonious_dictionary_spec.rb +40 -0
  32. data/spec/model_additions_spec.rb +57 -0
  33. data/spec/spec_helper.rb +21 -0
  34. metadata +99 -0
@@ -0,0 +1,20 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "harmonious_dictionary/version"
4
+
5
# Gem specification for harmonious_dictionary.
#
# The packaged file lists are taken from whatever git currently tracks, so
# this file has to be evaluated from inside a git checkout.
Gem::Specification.new do |spec|
  # Identity and metadata.
  spec.name        = "harmonious_dictionary"
  spec.version     = HarmoniousDictionary::VERSION
  spec.summary     = %q{filter any words that need to be harmonized}
  spec.description = %q{和谐宝典用于检查输入是否包含中文或英文敏感词,并可替换为特殊字符。速度比常规的正则匹配要快10倍以上。生活在天朝,和谐宝典必须人手必备。}
  spec.homepage    = "https://github.com/wear/harmonious_dictionary"
  spec.authors     = ["Stephen Kong"]
  spec.email       = ["wear63659220@gmail.com"]

  # Packaged content: every tracked file, the test subset, and the basenames
  # of the scripts under bin/.
  spec.require_paths = ["lib"]
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }

  spec.add_development_dependency "rspec"
end
@@ -0,0 +1,16 @@
1
+ require "rails/generators"
2
+
3
module HarmoniousDictionary
  module Generators
    # Rails generator that copies the dictionary templates shipped next to
    # this file into the host application's config/harmonious_dictionary
    # directory.
    class SetupGenerator < ::Rails::Generators::Base
      desc "This generator creates the necessary dictionary files at config/harmonious_dictionary"
      source_root File.expand_path("../templates", __FILE__)

      # Copies the Chinese and English word-list templates into the
      # application. The remote server configuration is intentionally not
      # installed (that copy is disabled below).
      def generate_setup
        # copy_file "remote_server.yml", "config/harmonious_dictionary/remote_server.yml"
        copy_file "chinese_dictionary.txt", "config/harmonious_dictionary/chinese_dictionary.txt"
        copy_file "english_dictionary.txt", "config/harmonious_dictionary/english_dictionary.txt"
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
+ development:
2
+ url: 127.0.0.1:4100
3
+
4
+ test:
5
+ url: 127.0.0.1:4100
6
+
7
+ production:
8
+ url: 127.0.0.1:4100
@@ -0,0 +1,48 @@
1
+ # encoding: utf-8
2
+
3
+ require "harmonious_dictionary/rseg"
4
+ require "harmonious_dictionary/version"
5
+ require "harmonious_dictionary/model_additions"
6
+ require "harmonious_dictionary/railtie" if defined? Rails
7
+
8
+
9
# Public facade of the gem: checks text for "harmonious" (sensitive) words
# and masks them with asterisks. Word detection itself is delegated to
# HarmoniousDictionary::Rseg.
module HarmoniousDictionary
  # Returns true when the local segmenter finds no sensitive word in +input+.
  def self.clean?(input)
    HarmoniousDictionary::Rseg.segment(input).empty?
  end

  # Same as +clean?+, but uses the remote segmentation server.
  # Note that Rseg.remote_segment returns a one-element array holding an
  # error message when the server cannot be reached, so an unreachable
  # server makes this return false.
  def self.clean_by_remote?(input)
    HarmoniousDictionary::Rseg.remote_segment(input).empty?
  end

  # Masks every word reported by the remote server inside +input+.
  # +input+ is modified in place and also returned.
  def self.clean_by_remote(input)
    HarmoniousDictionary::Rseg.remote_segment(input).each do |result|
      # Work on a copy so the element of the result array is left untouched.
      word = result.dup.force_encoding('utf-8')
      # A String pattern is matched literally; interpolating the word into a
      # Regexp would give characters such as "." or "(" a special meaning.
      input.gsub!(word, clean_word_basic(word))
    end
    input
  end

  # Masks every sensitive word found by the local segmenter inside +input+.
  # +input+ is modified in place and also returned.
  def self.clean(input)
    HarmoniousDictionary::Rseg.segment(input).each do |word|
      # Literal (non-regex) replacement, see clean_by_remote.
      input.gsub!(word, clean_word_basic(word))
    end
    input
  end

  # Returns the list of sensitive words found in +input+.
  def self.harmonious_words(input)
    HarmoniousDictionary::Rseg.segment(input)
  end

  # Returns a mask made of one "*" per character of +word+.
  def self.clean_word_basic(word)
    '*' * word.size
  end

  # Returns the dictionary held by the first segmentation engine.
  # +engines+ is private on Rseg, hence the +send+.
  def self.chinese_harmonious
    Rseg.instance.send(:engines).first.dictionary
  end
end
@@ -0,0 +1,18 @@
1
+ require 'sinatra/base'
2
+
3
module HarmoniousDictionary
  # Small Sinatra application exposing the segmenter over HTTP.
  # Both routes read the text to analyse from the +input+ form parameter.
  class App < Sinatra::Base
    set :root, File.dirname(__FILE__) + "/.."
    set :app_file, __FILE__

    # Responds with the detected words separated by single spaces.
    post '/segment' do
      # A missing parameter is treated as empty text.
      @input = params[:input].to_s
      @result = HarmoniousDictionary::Rseg.segment(@input).join(' ')
    end

    # Endpoint used by Rseg.remote_segment, which splits the response body
    # on spaces. Returning the bare Array would have Sinatra emit the words
    # back to back with no separator, so the list is joined here.
    post '/seg' do
      @input = params[:input].to_s
      @result = HarmoniousDictionary::Rseg.segment(@input).join(' ')
    end
  end
end
@@ -0,0 +1,51 @@
1
module HarmoniousDictionary
  module RsegEngine
    # Dictionary-driven engine. The dictionary is a Marshal-dumped structure
    # of nested hashes keyed by character (trie-like); a node carrying an
    # :end key marks the end of a complete word. The engine is fed one
    # character at a time through +process+.
    class Dict < Engine
      # The given block must return the path of the marshalled dictionary.
      def initialize(&block)
        @dict_path = block.call
        @word = ''
        super
      end

      # Returns the loaded dictionary root, loading it on first use.
      # (Previously this read an unassigned class variable and raised
      # NameError.)
      def dictionary
        @root ||= load_dict(@dict_path)
      end

      # Feeds one character to the engine.
      #
      # Returns [match, word]:
      # * [true, nil] when +char+ extends the path currently being followed;
      # * [false, word] otherwise, after resetting to the dictionary root.
      #   +word+ is the accumulated String when the current node is marked
      #   :end or exactly one character was accumulated, and an Array of the
      #   accumulated characters in every other case.
      def process(char)
        @root ||= load_dict(@dict_path)
        @node ||= @root

        next_node = @node[char]
        if next_node
          @word << char
          @node = next_node
          return [true, nil]
        end

        word = (@node[:end] || @word.length == 1) ? @word : @word.chars

        # Start over from the root for the next character.
        @node = @root
        @word = ''
        [false, word]
      end

      private

      # Reads the marshalled dictionary from +path+.
      #
      # Failures are reported on stderr and re-raised instead of terminating
      # the host process with +exit+ (which also reported status 0).
      # NOTE: Marshal.load must only ever be given trusted files.
      def load_dict(path)
        File.open(path, "rb") { |io| Marshal.load(io) }
      rescue => e
        warn "HarmoniousDictionary: cannot load dictionary #{path.inspect}: #{e.message}"
        raise
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module HarmoniousDictionary
  module RsegEngine
    # Base class of the segmentation engines. It only tracks a running flag,
    # which starts out true.
    class Engine
      def initialize
        @running = true
      end

      # Is the engine currently switched on?
      def running?
        @running
      end

      # Switches the engine (back) on. Returns true.
      def run
        @running = true
      end

      # Switches the engine off. Returns false.
      def stop
        @running = false
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module HarmoniousDictionary
  module RsegEngine
    # The 52 ASCII letters, lower case first.
    LETTER_SYMBOLS = [*'a'..'z', *'A'..'Z']

    # Engine that collects runs of ASCII letters into words.
    class English < Engine
      def initialize
        @word = ''
        super
      end

      # Feeds one character to the engine.
      #
      # Returns [true, nil] while +char+ is an ASCII letter (it is appended
      # to the word in progress). Any other input ends the word: the
      # accumulated string (possibly empty) is returned as [false, word] and
      # a fresh accumulator is started.
      def process(char)
        if LETTER_SYMBOLS.include?(char)
          @word << char
          return [true, nil]
        end

        finished = @word
        @word = ''
        [false, finished]
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # encoding: utf-8
2
+
3
module RsegFilter
  # Filter that flags single-character Chinese function words.
  class Conjunction
    @@conjunctions = %w(给 的 说 对 在 和 是 被 最 所 那 由 这 有 将 你 会 与 他 为 不 没 很 了 啊 哦 呵 把 去 从)

    class << self
      # Returns :conjunction when +char+ is one of the listed words,
      # otherwise returns +char+ unchanged.
      def filter(char)
        return :conjunction if @@conjunctions.include?(char)

        char
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # encoding: utf-8
2
+
3
module RsegFilter
  # Filter that folds full-width characters to their ASCII counterparts.
  #
  # The previous literal table had digit and letter keys that were identical
  # to their values (the full-width forms had evidently been lost to
  # character normalization), which made the filter a no-op for them. The
  # table is now derived from code points so that it cannot be silently
  # flattened again.
  class Fullwidth
    # Full-width ASCII variants (U+FF01..U+FF5E) sit exactly this far above
    # the ASCII characters they correspond to.
    FULLWIDTH_OFFSET = 0xFEE0

    halfwidth = [*'0'..'9', *'a'..'z', *'A'..'Z', '-', '+', ',', '/']

    @@fullwidth_chars = halfwidth.each_with_object({}) do |half, table|
      table[(half.ord + FULLWIDTH_OFFSET).chr('UTF-8')] = half
    end
    # Punctuation that is folded although it is not a full-width variant.
    @@fullwidth_chars.update('—' => '-', '·' => '.')

    class << self
      # Returns the ASCII replacement for +char+, or +char+ itself when the
      # table has no entry for it.
      def filter(char)
        @@fullwidth_chars.fetch(char, char)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # encoding: utf-8
2
+
3
module RsegFilter
  # Filter that flags punctuation and whitespace as word separators.
  class Symbol
    # NOTE(review): several entries appear more than once; they may have
    # been distinct full-width and half-width forms originally -- confirm
    # against the upstream file.
    @@separators = ['`', '[', ']', '、', '=', '‘', ';', '。', '|', '?', '》',
                    '《', ':', '“', '{', '}', ')', '(', '*', '…', '#', '!',
                    '~', '’', '”', '〕', '〈', '〉', '「', '」', '『', '』', '〖', '〗',
                    '【', '】', '<', '>', '`', '~', '!', '@', '#', '^',
                    '&', '*', '\\', '(', ')', '=', '{', '}', '[', ']',
                    '|', ';', ':', "'", '<', '>', '?', "\n", "\t", "\r",
                    ' ', '-', '/', '+', ',', ' ']

    class << self
      # Returns :symbol when +char+ is a known separator, otherwise returns
      # +char+ unchanged.
      def filter(char)
        return :symbol if @@separators.include?(char)

        char
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # encoding: utf-8
2
+
3
module HarmoniousDictionary
  # Class-level validation macro, meant to be extended into model classes
  # that provide +validates_each+.
  module ModelAdditions
    # Adds an error to each listed attribute whose value is not blank and is
    # rejected by HarmoniousDictionary.clean?.
    #
    # attr_names - attribute names, optionally followed by an options Hash.
    #              The :message option overrides the default error message.
    def validate_harmonious_of(*attr_names)
      options = { message: '不能含有敏感词' }
      options.update(attr_names.pop) if attr_names.last.is_a?(Hash)

      validates_each attr_names do |model, attribute, value|
        next if value.blank?
        next if HarmoniousDictionary.clean?(value)

        model.errors.add(attribute, options[:message])
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
module HarmoniousDictionary
  # Rails integration: registers the gem's rake task file and makes the
  # validation macro available once Active Record has loaded.
  class Railtie < Rails::Railtie
    # Remote-server support is currently disabled:
    #
    #   config.harmonious_dictionary = ActiveSupport::OrderedOptions.new
    #   config.harmonious_dictionary.use_remote_server = false
    #   config.after_initialize do
    #     if config.harmonious_dictionary.use_remote_server
    #       Rseg.instance.load_remote_url_config
    #     end
    #   end

    rake_tasks { load "tasks/generate_dictionary.rake" }

    initializer 'HarmoniousDictionary.model_additions' do
      # The block runs in the context Active Record's load hook provides.
      ActiveSupport.on_load(:active_record) { extend ModelAdditions }
    end
  end
end
@@ -0,0 +1,170 @@
1
+ # encoding: utf-8
2
+
3
+ require 'singleton'
4
+ require 'net/http'
5
+ require 'yaml'
6
+
7
+ require File.join(File.dirname(__FILE__), 'engines/engine')
8
+ require File.join(File.dirname(__FILE__), 'engines/dict')
9
+ require File.join(File.dirname(__FILE__), 'engines/english')
10
+
11
+ require File.join(File.dirname(__FILE__), 'filters/fullwidth')
12
+ require File.join(File.dirname(__FILE__), 'filters/symbol')
13
+ require File.join(File.dirname(__FILE__), 'filters/conjunction')
14
+
15
module HarmoniousDictionary
  # Segmenter that scans text character by character and collects the words
  # recognised by its engines (dictionary engine first, English engine
  # second).
  #
  # The class is a Singleton and keeps the current input and result list in
  # instance variables, so the one shared instance carries state between the
  # +input=+ assignment and the +segment+ call made by Rseg.segment.
  class Rseg
    include Singleton
    include RsegEngine
    include RsegFilter
    attr_writer :input

    class << self
      # Segments +input+ with the shared instance and returns the list of
      # detected words.
      def segment(input)
        HarmoniousDictionary::Rseg.instance.input = input
        HarmoniousDictionary::Rseg.instance.segment
      end

      # Forces creation of the shared instance. +dict+ is accepted but not
      # used. Always returns nil.
      def load(dict)
        HarmoniousDictionary::Rseg.instance
        nil
      end

      # Posts +input+ to the "/seg" endpoint of the configured remote server
      # and returns the response body split on spaces.
      #
      # NOTE(review): when the server answers with a non-200 status, or the
      # request raises, the return value is a one-element Array holding an
      # error message rather than a list of words -- callers cannot tell the
      # two apart.
      def remote_segment(input)
        endpoint = "http://#{HarmoniousDictionary::Rseg.instance.remote_url}/seg"
        failure = ["Can't connect to #{endpoint}\nUse rseg_server to start it"]

        begin
          response = Net::HTTP.post_form(URI.parse(endpoint), :input => input)
          response.code == '200' ? response.body.split(' ') : failure
        rescue
          failure
        end
      end
    end

    def initialize
      @input = ''
      @words = []
      @chinese_dictionary_path = chinese_dictionary_path
      init_engines
      init_filters
    end

    # "host:port" of the remote segmentation server, read once from the
    # application's remote_server.yml.
    def remote_url
      @remote_url ||= load_remote_url_config
    end

    # Runs the engines over the current input and returns the detected
    # words. A nil input is treated as empty text.
    def segment
      @words = []
      @input.to_s.each_char do |origin|
        process(filter(origin), origin)
      end

      # A trailing separator flushes whatever word is still in progress.
      process(:symbol, '')
      @words
    end

    private

    # Passes +char+ through every filter in order and returns the result.
    def filter(char)
      @filters.inject(char) { |result, klass| klass.filter(result) }
    end

    # Feeds one (filtered) character to every running engine.
    #
    # An engine that fails to match is stopped. Once no engine matched, all
    # engines are restarted; if the last engine consulted produced a word it
    # is recorded (Strings of two or more characters only) and the same
    # character is processed again from a clean state.
    def process(char, origin)
      nomatch = true
      word = ''
      @english_dictionary ||= load_english_dictionary(english_yaml_path)

      engines.each do |engine|
        next unless engine.running?
        match, word = engine.process(char)
        if match
          nomatch = false
        else
          # English words only count when they are listed in the dictionary.
          word = '' if engine.class == English && !@english_dictionary.include?(word)
          engine.stop
        end
      end

      return unless nomatch

      reset_engines
      # Nothing was cut out: the text is treated as ordinary and produces no
      # output.
      return if word == ''

      # Only exact matches of sensitive words are wanted, so the following
      # context is not examined (no reprocess of Array results).
      @words << word if word.is_a?(String) && word.size >= 2
      # Re-process the current character.
      process(char, origin)
    end

    # Replays the characters of +word+ (an Array) through +process+.
    # Currently not called from +process+.
    def reprocess(word)
      last = word.pop

      word.each do |char|
        process(char, char)
      end

      process(:symbol, :symbol) # flush the word into the results
      process(last, last)       # carry on analysing the word's last character
    end

    # Switches every engine back on.
    def reset_engines
      engines.each(&:run)
    end

    # Assigns the engine list only if none has been set yet.
    def engines=(engines)
      @engines ||= engines
    end

    def engines
      @engines
    end

    def init_filters
      @filters = [Fullwidth, Symbol]
    end

    # Builds the engine list once. Dict receives its dictionary path through
    # a block evaluated in the context of this instance.
    def init_engines
      @engines ||= [Dict.new { @chinese_dictionary_path }, English.new]
    end

    # Reads the remote server address for the current Rails environment.
    def load_remote_url_config
      config_file = File.join(Rails.root, 'config', 'harmonious_dictionary', 'remote_server.yml')
      YAML.load(File.read(config_file))[Rails.env]['url']
    end

    # Loads the English dictionary from the YAML file at +path+.
    #
    # Failures are reported on stderr and re-raised instead of terminating
    # the host process with +exit+ (which also reported status 0).
    def load_english_dictionary(path)
      YAML.load(File.read(path))
    rescue => e
      warn "HarmoniousDictionary: cannot load English dictionary #{path.inspect}: #{e.message}"
      raise
    end

    def english_yaml_path
      File.join(Rails.root, 'config', 'harmonious_dictionary', 'harmonious_english.yml')
    end

    def chinese_dictionary_path
      File.join(Rails.root, 'config', 'harmonious_dictionary', 'harmonious.hash')
    end
  end
end