harmonious_dictionary 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. data/.gitignore +5 -0
  2. data/.rspec +1 -0
  3. data/CHANGELOG +3 -0
  4. data/MIT-LICENSE +20 -0
  5. data/README.markdown +53 -0
  6. data/Rakefile +8 -0
  7. data/benchmark/benchmark.rb +43 -0
  8. data/benchmark/text_test_100.txt +4 -0
  9. data/benchmark/text_test_1000.txt +25 -0
  10. data/benchmark/text_test_10000.txt +219 -0
  11. data/bin/harmonious_rseg +11 -0
  12. data/bin/harmonious_server +63 -0
  13. data/harmonious_dictionary.gemspec +20 -0
  14. data/lib/generators/harmonious_dictionary/setup/setup_generator.rb +16 -0
  15. data/lib/generators/harmonious_dictionary/setup/templates/chinese_dictionary.txt +0 -0
  16. data/lib/generators/harmonious_dictionary/setup/templates/english_dictionary.txt +0 -0
  17. data/lib/generators/harmonious_dictionary/setup/templates/remote_server.yml +8 -0
  18. data/lib/harmonious_dictionary.rb +48 -0
  19. data/lib/harmonious_dictionary/app.rb +18 -0
  20. data/lib/harmonious_dictionary/engines/dict.rb +51 -0
  21. data/lib/harmonious_dictionary/engines/engine.rb +21 -0
  22. data/lib/harmonious_dictionary/engines/english.rb +27 -0
  23. data/lib/harmonious_dictionary/filters/conjunction.rb +11 -0
  24. data/lib/harmonious_dictionary/filters/fullwidth.rb +21 -0
  25. data/lib/harmonious_dictionary/filters/symbol.rb +16 -0
  26. data/lib/harmonious_dictionary/model_additions.rb +15 -0
  27. data/lib/harmonious_dictionary/railtie.rb +23 -0
  28. data/lib/harmonious_dictionary/rseg.rb +170 -0
  29. data/lib/harmonious_dictionary/version.rb +3 -0
  30. data/lib/tasks/generate_dictionary.rake +55 -0
  31. data/spec/harmonious_dictionary_spec.rb +40 -0
  32. data/spec/model_additions_spec.rb +57 -0
  33. data/spec/spec_helper.rb +21 -0
  34. metadata +99 -0
@@ -0,0 +1,20 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "harmonious_dictionary/version"
4
+
5
# Gem packaging metadata for harmonious_dictionary.
# File lists are taken from whatever git currently tracks, so this must be
# evaluated inside a git checkout with the `git` executable available.
Gem::Specification.new do |spec|
  spec.name        = "harmonious_dictionary"
  spec.version     = HarmoniousDictionary::VERSION
  spec.authors     = ["Stephen Kong"]
  spec.email       = ["wear63659220@gmail.com"]
  spec.homepage    = "https://github.com/wear/harmonious_dictionary"
  spec.summary     = 'filter any words that need to be harmonized'
  spec.description = %q{和谐宝典用于检查输入是否包含中文或英文敏感词,并可替换为特殊字符。速度比常规的正则匹配要快10倍以上。生活在天朝,和谐宝典必须人手必备。}

  # Everything tracked by git ships with the gem.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  # Executables are referenced by bare file name, not by path.
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "rspec"
end
@@ -0,0 +1,16 @@
1
+ require "rails/generators"
2
+
3
module HarmoniousDictionary
  module Generators
    # Rails generator that copies the (empty) dictionary templates shipped
    # with the gem into the host application's config/harmonious_dictionary
    # directory.
    class SetupGenerator < ::Rails::Generators::Base
      desc "This generator creates the necessary dictionary files at config/harmonious_dictionary"
      source_root File.expand_path("../templates", __FILE__)

      # Copies each dictionary template to config/harmonious_dictionary/.
      def generate_setup
        # The remote server template is intentionally not copied:
        # copy_file "remote_server.yml", "config/harmonious_dictionary/remote_server.yml"
        %w[chinese_dictionary.txt english_dictionary.txt].each do |template|
          copy_file template, "config/harmonious_dictionary/#{template}"
        end
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
+ development:
2
+ url: 127.0.0.1:4100
3
+
4
+ test:
5
+ url: 127.0.0.1:4100
6
+
7
+ production:
8
+ url: 127.0.0.1:4100
@@ -0,0 +1,48 @@
1
+ # encoding: utf-8
2
+
3
+ require "harmonious_dictionary/rseg"
4
+ require "harmonious_dictionary/version"
5
+ require "harmonious_dictionary/model_additions"
6
+ require "harmonious_dictionary/railtie" if defined? Rails
7
+
8
+
9
# Public entry points for detecting and masking sensitive ("harmonious")
# words. Detection is delegated to HarmoniousDictionary::Rseg, either
# in-process (`segment`) or over HTTP (`remote_segment`).
module HarmoniousDictionary
  # @param input [String] text to check
  # @return [Boolean] true when local segmentation finds no sensitive word
  def self.clean?(input)
    HarmoniousDictionary::Rseg.segment(input).empty?
  end

  # Same as {clean?} but uses the remote segmentation server.
  #
  # @param input [String] text to check
  # @return [Boolean] true when the remote call returns no words
  def self.clean_by_remote?(input)
    HarmoniousDictionary::Rseg.remote_segment(input).empty?
  end

  # Masks every word reported by the remote server with asterisks.
  # NOTE: +input+ is modified in place and also returned.
  #
  # @param input [String] text to mask
  # @return [String] the same +input+ object, masked
  def self.clean_by_remote(input)
    HarmoniousDictionary::Rseg.remote_segment(input).each do |result|
      word = result.force_encoding('utf-8')
      # A String pattern is matched literally, so regexp metacharacters in
      # the word ('.', '*', '(' ...) cannot over-match or raise RegexpError.
      input.gsub!(word, clean_word_basic(word))
    end
    input
  end

  # Masks every locally detected sensitive word with asterisks.
  # NOTE: +input+ is modified in place and also returned.
  #
  # @param input [String] text to mask
  # @return [String] the same +input+ object, masked
  def self.clean(input)
    HarmoniousDictionary::Rseg.segment(input).each do |word|
      # Literal (String) pattern: see clean_by_remote.
      input.gsub!(word, clean_word_basic(word))
    end
    input
  end

  # @param input [String] text to analyse
  # @return [Array<String>] the sensitive words found by local segmentation
  def self.harmonious_words(input)
    HarmoniousDictionary::Rseg.segment(input)
  end

  # @param word [String] word to mask
  # @return [String] a new string of '*' with one asterisk per character
  def self.clean_word_basic(word)
    '*' * word.size
  end

  # Returns the dictionary held by the first segmentation engine of the
  # Rseg singleton (reaches into its private `engines` reader).
  def self.chinese_harmonious
    Rseg.instance.send(:engines).first.dictionary
  end
end
@@ -0,0 +1,18 @@
1
+ require 'sinatra/base'
2
+
3
module HarmoniousDictionary
  # Small Sinatra application exposing the segmenter over HTTP.
  # Both routes read the text from the `input` form parameter.
  class App < Sinatra::Base
    set :root, File.dirname(__FILE__) + "/.."
    set :app_file, __FILE__

    # Responds with the detected words joined by single spaces.
    post '/segment' do
      @input = params[:input]
      @result = HarmoniousDictionary::Rseg.segment(@input).join(' ')
    end

    # Responds with the detected words joined by single spaces.
    # Returning the bare Array would make Sinatra emit the words as body
    # chunks with no separator, while the client (Rseg.remote_segment)
    # splits the response body on spaces.
    post '/seg' do
      @input = params[:input]
      @result = HarmoniousDictionary::Rseg.segment(@input)
      @result.join(' ')
    end
  end
end
@@ -0,0 +1,51 @@
1
module HarmoniousDictionary
  module RsegEngine
    # Engine that walks a nested-hash dictionary (loaded from a Marshal dump)
    # one character at a time, accumulating the characters matched so far.
    class Dict < Engine
      # The given block is called immediately; its return value is used as
      # the path of the marshalled dictionary file.
      def initialize(&block)
        @dict_path = block.call
        @word = ''
        super
      end

      # Returns the loaded dictionary root, loading it on first use.
      # (Previously this read an unassigned class variable and raised
      # NameError; the dictionary is stored in @root.)
      def dictionary
        @root ||= load_dict(@dict_path)
      end

      # Feeds one character into the dictionary walk.
      #
      # @param char [Object] the filtered character (or a marker symbol)
      # @return [Array] `[true, nil]` while the walk can continue;
      #   otherwise `[false, word]`, where word is the accumulated String
      #   when the current node has an :end entry or exactly one character
      #   was accumulated, and an Array of the accumulated characters
      #   otherwise. The walk is reset to the root after a miss.
      def process(char)
        @root ||= load_dict(@dict_path)
        @node ||= @root

        if @node[char]
          @word << char
          @node = @node[char]
          return [true, nil]
        end

        word = (@node[:end] || @word.length == 1) ? @word : @word.chars.to_a

        @node = @root
        @word = ''
        [false, word]
      end

      private

      # Loads the marshalled dictionary at +path+. On any StandardError the
      # error is printed and the process exits.
      # NOTE(review): Marshal.load must only be given trusted files.
      def load_dict(path)
        File.open(path, "rb") { |io| Marshal.load(io) }
      rescue => e
        puts e
        exit
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module HarmoniousDictionary
  module RsegEngine
    # Base class for segmentation engines: it only tracks an on/off
    # "running" flag that subclasses and the segmenter consult.
    class Engine
      # A new engine starts in the running state.
      def initialize
        @running = true
      end

      # @return [Boolean] whether the engine is currently running
      def running?
        @running
      end

      # Marks the engine as running again.
      def run
        @running = true
      end

      # Marks the engine as stopped.
      def stop
        @running = false
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module HarmoniousDictionary
  module RsegEngine
    # ASCII letters, lower case first, then upper case.
    LETTER_SYMBOLS = [*'a'..'z', *'A'..'Z']

    # Engine that accumulates consecutive ASCII letters into a word.
    class English < Engine
      def initialize
        @word = ''
        super
      end

      # Feeds one character into the engine.
      #
      # @return [Array] `[true, nil]` when +char+ is an ASCII letter (it is
      #   appended to the pending word); otherwise `[false, word]` with the
      #   word accumulated so far, after which the pending word is reset.
      def process(char)
        if LETTER_SYMBOLS.include?(char)
          @word << char
          return [true, nil]
        end

        finished = @word
        @word = ''
        [false, finished]
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # encoding: utf-8
2
+
3
module RsegFilter
  # Filter that replaces common single-character Chinese function words
  # with the marker :conjunction.
  class Conjunction
    @@conjunctions = %w[给 的 说 对 在 和 是 被 最 所 那 由 这 有 将 你 会 与 他 为 不 没 很 了 啊 哦 呵 把 去 从]

    class << self
      # @return [Symbol, Object] :conjunction when +char+ is in the list,
      #   otherwise +char+ itself
      def filter(char)
        return :conjunction if @@conjunctions.include?(char)

        char
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # encoding: utf-8
2
+
3
module RsegFilter
  # Filter that normalises fullwidth characters to their ASCII equivalents.
  class Fullwidth
    # Digits, letters and the punctuation handled by this filter. Each one
    # has a fullwidth form exactly 0xFEE0 code points above it
    # (e.g. 'A' U+0041 -> U+FF21).
    ascii_targets = [*'0'..'9', *'a'..'z', *'A'..'Z', '-', '+', ',', '/']

    # Keys are the fullwidth forms; values are the ASCII replacements.
    # (With ASCII keys the table was an identity map and nothing was ever
    # normalised.)
    @@fullwidth_chars = ascii_targets.each_with_object({}) do |ascii, table|
      table[(ascii.ord + 0xFEE0).chr('UTF-8')] = ascii
    end
    # Characters without a fixed-offset ASCII counterpart.
    @@fullwidth_chars['—'] = '-'
    @@fullwidth_chars['·'] = '.'

    class << self
      # @return [Object] the ASCII replacement when +char+ is in the table,
      #   otherwise +char+ unchanged
      def filter(char)
        @@fullwidth_chars.fetch(char, char)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # encoding: utf-8
2
+
3
module RsegFilter
  # Filter that replaces punctuation and whitespace separators with the
  # marker :symbol.
  class Symbol
    @@separators = ['`', '[', ']', '、', '=', '‘', ';', '。', '|', '?', '》',
                    '《', ':', '“', '{', '}', ')', '(', '*', '…', '#', '!',
                    '~', '’', '”', '〕', '〈', '〉', '「', '」', '『', '』', '〖', '〗',
                    '【', '】', '<', '>', '`', '~', '!', '@', '#', '^',
                    '&', '*', '\\', '(', ')', '=', '{', '}', '[', ']',
                    '|', ';', ':', "'", '<', '>', '?', "\n", "\t", "\r",
                    ' ', '-', '/', '+', ',', ' ']

    class << self
      # @return [Symbol, Object] :symbol when +char+ is a separator,
      #   otherwise +char+ itself
      def filter(char)
        return :symbol if @@separators.include?(char)

        char
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # encoding: utf-8
2
+
3
module HarmoniousDictionary
  # Class-level validation macro meant to be mixed into model classes that
  # provide `validates_each`.
  module ModelAdditions
    # Adds an error to each listed attribute whose non-blank value fails
    # HarmoniousDictionary.clean?.
    #
    # @param attr_names [Array] attribute names, optionally followed by an
    #   options Hash; `:message` overrides the default error message
    def validate_harmonious_of(*attr_names)
      overrides = attr_names.last.is_a?(Hash) ? attr_names.pop : {}
      configuration = { message: '不能含有敏感词' }.merge(overrides)

      validates_each attr_names do |model, attribute, value|
        # Blank values are never checked.
        next if value.blank?
        next if HarmoniousDictionary.clean?(value)

        model.errors.add(attribute, configuration[:message])
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
module HarmoniousDictionary
  # Rails integration: registers the gem's rake task file and extends
  # ActiveRecord with the validation macro from ModelAdditions.
  class Railtie < Rails::Railtie
    # Remote-server configuration is currently disabled:
    # config.harmonious_dictionary = ActiveSupport::OrderedOptions.new
    # config.harmonious_dictionary.use_remote_server = false
    # config.after_initialize do
    #   if config.harmonious_dictionary.use_remote_server
    #     Rseg.instance.load_remote_url_config
    #   end
    # end

    rake_tasks { load "tasks/generate_dictionary.rake" }

    initializer 'HarmoniousDictionary.model_additions' do
      # Runs once ActiveRecord has been loaded.
      ActiveSupport.on_load(:active_record) { extend ModelAdditions }
    end
  end
end
@@ -0,0 +1,170 @@
1
+ # encoding: utf-8
2
+
3
+ require 'singleton'
4
+ require 'net/http'
5
+ require 'yaml'
6
+
7
+ require File.join(File.dirname(__FILE__), 'engines/engine')
8
+ require File.join(File.dirname(__FILE__), 'engines/dict')
9
+ require File.join(File.dirname(__FILE__), 'engines/english')
10
+
11
+ require File.join(File.dirname(__FILE__), 'filters/fullwidth')
12
+ require File.join(File.dirname(__FILE__), 'filters/symbol')
13
+ require File.join(File.dirname(__FILE__), 'filters/conjunction')
14
+
15
module HarmoniousDictionary
  # Singleton segmenter. Each input character is passed through the filters
  # and then offered to every running engine; words produced by the engines
  # are collected in @words.
  class Rseg
    include Singleton
    include RsegEngine
    include RsegFilter
    attr_writer :input

    class << self
      # Segments +input+ with the shared instance and returns the words found.
      def segment(input)
        rseg = HarmoniousDictionary::Rseg.instance
        rseg.input = input
        rseg.segment
      end

      # Forces creation of the shared instance; +dict+ is not used.
      def load(dict)
        HarmoniousDictionary::Rseg.instance
        nil
      end

      # Posts +input+ to the remote server's /seg endpoint and returns the
      # response body split on spaces. On a non-200 response or any
      # StandardError a one-element array holding an error message is
      # returned instead.
      def remote_segment(input)
        endpoint = URI.parse("http://#{HarmoniousDictionary::Rseg.instance.remote_url}/seg")
        reply = Net::HTTP.post_form(endpoint, :input => input)
        if reply.code == '200'
          reply.body.split(' ')
        else
          ["Can't connect to http://#{HarmoniousDictionary::Rseg.instance.remote_url}/seg\nUse rseg_server to start it"]
        end
      rescue
        ["Can't connect to http://#{HarmoniousDictionary::Rseg.instance.remote_url}/seg\nUse rseg_server to start it"]
      end
    end

    def initialize
      @input = ''
      @words = []
      @chinese_dictionary_path = chinese_dictionary_path
      init_engines
      init_filters
    end

    # Remote server address, read from the YAML config on first use.
    def remote_url
      @remote_url ||= load_remote_url_config
    end

    # Runs the segmentation over the current @input and returns the words.
    def segment
      @words = []
      @input.chars.each { |origin| process(filter(origin), origin) }

      # A trailing separator makes the engines emit any pending word.
      process(:symbol, '')
      @words
    end

    private

    # Passes +char+ through every filter in order and returns the result.
    def filter(char)
      @filters.inject(char) { |current, klass| klass.filter(current) }
    end

    # Offers +char+ to every running engine. When none of them matches, the
    # word produced by the last engine consulted is recorded (if it is a
    # String of at least two characters) and +char+ is processed again with
    # freshly restarted engines.
    def process(char, origin)
      @english_dictionary ||= load_english_dictionary(english_yaml_path)
      nomatch = true
      word = ''

      engines.each do |engine|
        next unless engine.running?

        match, word = engine.process(char)
        if match
          nomatch = false
          next
        end

        # English words missing from the English dictionary are discarded.
        word = '' if engine.class == English && !@english_dictionary.include?(word)
        engine.stop
      end

      return unless nomatch

      reset_engines
      # Anything that was not segmented out is treated as a normal word and
      # is not emitted.
      # @words << origin unless char == :symbol
      return if word == ''

      @words << word if word.size >= 2 && word.is_a?(String)
      # Only exact matches of sensitive words are needed; the following
      # context is not examined.
      # reprocess(word) if word.is_a?(Array)
      # Re-process the current char.
      process(char, origin)
    end

    def reprocess(word)
      last = word.pop

      word.each { |char| process(char, char) }

      process(:symbol, :symbol) # add the word
      process(last, last)       # continue analysing the word's last character
    end

    # Puts every engine back into the running state.
    def reset_engines
      engines.each(&:run)
    end

    # Only takes effect while no engines have been set yet.
    def engines=(engines)
      @engines ||= engines
    end

    def engines
      @engines
    end

    def init_filters
      @filters = [Fullwidth, Symbol]
    end

    def init_engines
      @engines ||= [Dict, English].map do |engine_klass|
        next engine_klass.new unless engine_klass == Dict

        # The block's value is what Dict uses as its dictionary path.
        engine_klass.new { @dict_path = @chinese_dictionary_path }
      end
    end

    def load_remote_url_config
      config_file = File.join(Rails.root, 'config','harmonious_dictionary','remote_server.yml')
      YAML.load(File.read(config_file))[Rails.env]['url']
    end

    # Loads the English word list; on failure prints the error and exits.
    def load_english_dictionary(path)
      YAML.load(File.read(path))
    rescue => e
      puts e
      exit
    end

    def english_yaml_path
      File.join(Rails.root, 'config','harmonious_dictionary','harmonious_english.yml')
    end

    def chinese_dictionary_path
      File.join(Rails.root, 'config','harmonious_dictionary','harmonious.hash')
    end
  end
end