harmonious_dictionary 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/.rspec +1 -0
- data/CHANGELOG +3 -0
- data/MIT-LICENSE +20 -0
- data/README.markdown +53 -0
- data/Rakefile +8 -0
- data/benchmark/benchmark.rb +43 -0
- data/benchmark/text_test_100.txt +4 -0
- data/benchmark/text_test_1000.txt +25 -0
- data/benchmark/text_test_10000.txt +219 -0
- data/bin/harmonious_rseg +11 -0
- data/bin/harmonious_server +63 -0
- data/harmonious_dictionary.gemspec +20 -0
- data/lib/generators/harmonious_dictionary/setup/setup_generator.rb +16 -0
- data/lib/generators/harmonious_dictionary/setup/templates/chinese_dictionary.txt +0 -0
- data/lib/generators/harmonious_dictionary/setup/templates/english_dictionary.txt +0 -0
- data/lib/generators/harmonious_dictionary/setup/templates/remote_server.yml +8 -0
- data/lib/harmonious_dictionary.rb +48 -0
- data/lib/harmonious_dictionary/app.rb +18 -0
- data/lib/harmonious_dictionary/engines/dict.rb +51 -0
- data/lib/harmonious_dictionary/engines/engine.rb +21 -0
- data/lib/harmonious_dictionary/engines/english.rb +27 -0
- data/lib/harmonious_dictionary/filters/conjunction.rb +11 -0
- data/lib/harmonious_dictionary/filters/fullwidth.rb +21 -0
- data/lib/harmonious_dictionary/filters/symbol.rb +16 -0
- data/lib/harmonious_dictionary/model_additions.rb +15 -0
- data/lib/harmonious_dictionary/railtie.rb +23 -0
- data/lib/harmonious_dictionary/rseg.rb +170 -0
- data/lib/harmonious_dictionary/version.rb +3 -0
- data/lib/tasks/generate_dictionary.rake +55 -0
- data/spec/harmonious_dictionary_spec.rb +40 -0
- data/spec/model_additions_spec.rb +57 -0
- data/spec/spec_helper.rb +21 -0
- metadata +99 -0
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "harmonious_dictionary/version"
|
4
|
+
|
5
|
+
# Gem specification for harmonious_dictionary: identity metadata, the file
# manifest (read from the git index) and development dependencies.
Gem::Specification.new do |spec|
  # --- Identity -----------------------------------------------------------
  spec.name        = "harmonious_dictionary"
  spec.version     = HarmoniousDictionary::VERSION
  spec.authors     = ["Stephen Kong"]
  spec.email       = ["wear63659220@gmail.com"]
  spec.homepage    = "https://github.com/wear/harmonious_dictionary"
  spec.summary     = %q{filter any words that need to be harmonized}
  spec.description = %q{和谐宝典用于检查输入是否包含中文或英文敏感词,并可替换为特殊字符。速度比常规的正则匹配要快10倍以上。生活在天朝,和谐宝典必须人手必备。}

  # --- Manifest -----------------------------------------------------------
  # Every list below comes from `git ls-files`, so the gem can only be built
  # from inside a git checkout.
  spec.files      = `git ls-files`.split("\n")
  spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")

  # Executables are listed by base name only (the bin/ prefix is dropped).
  bin_paths        = `git ls-files -- bin/*`.split("\n")
  spec.executables = bin_paths.map { |path| File.basename(path) }

  spec.require_paths = ["lib"]

  # --- Dependencies -------------------------------------------------------
  spec.add_development_dependency "rspec"
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require "rails/generators"
|
2
|
+
|
3
|
+
module HarmoniousDictionary
  module Generators
    # Rails generator that copies the dictionary template files shipped in
    # the sibling "templates" directory into the host application's
    # config/harmonious_dictionary directory.
    class SetupGenerator < ::Rails::Generators::Base
      # Help text shown by `rails generate --help`. The original sentence was
      # missing its noun ("creates necessary at ...").
      desc "This generator creates the necessary dictionary files at config/harmonious_dictionary"
      source_root File.expand_path("../templates", __FILE__)

      # Generator action: copies the Chinese and English dictionary templates.
      def generate_setup
        # The remote server configuration template is deliberately left
        # disabled, exactly as in the original:
        # copy_file "remote_server.yml", "config/harmonious_dictionary/remote_server.yml"
        copy_file "chinese_dictionary.txt", "config/harmonious_dictionary/chinese_dictionary.txt"
        copy_file "english_dictionary.txt", "config/harmonious_dictionary/english_dictionary.txt"
      end
    end
  end
end
|
File without changes
|
File without changes
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require "harmonious_dictionary/rseg"
|
4
|
+
require "harmonious_dictionary/version"
|
5
|
+
require "harmonious_dictionary/model_additions"
|
6
|
+
require "harmonious_dictionary/railtie" if defined? Rails
|
7
|
+
|
8
|
+
|
9
|
+
# Public entry points for detecting and masking sensitive words.
#
# Detection is delegated to HarmoniousDictionary::Rseg: the plain methods use
# Rseg.segment, the *_by_remote variants use Rseg.remote_segment.
module HarmoniousDictionary
  # Tells whether +input+ is free of sensitive words.
  #
  # @param input [String] text to check
  # @return [Boolean] true when Rseg.segment returns no words
  def self.clean?(input)
    HarmoniousDictionary::Rseg.segment(input).empty?
  end

  # Same as {clean?} but asks the remote segmentation server.
  #
  # NOTE(review): Rseg.remote_segment returns a one-element array holding an
  # error message when the server cannot be reached, so this method answers
  # false in that situation.
  #
  # @param input [String] text to check
  # @return [Boolean] true when Rseg.remote_segment returns no words
  def self.clean_by_remote?(input)
    HarmoniousDictionary::Rseg.remote_segment(input).empty?
  end

  # Masks, in place, every word reported by the remote server.
  #
  # Each returned word is re-tagged as UTF-8 before it is searched for.
  # The word is matched literally (String pattern), so characters such as
  # "." or "+" inside a word are not treated as regular-expression syntax.
  #
  # @param input [String] text to mask; it is MUTATED (gsub!)
  # @return [String] the same +input+ object
  def self.clean_by_remote(input)
    HarmoniousDictionary::Rseg.remote_segment(input).each do |word|
      utf8_word = word.force_encoding('utf-8')
      input.gsub!(utf8_word, clean_word_basic(utf8_word))
    end
    input
  end

  # Masks, in place, every word reported by the local segmenter.
  #
  # Words are matched literally; the original built a Regexp from each word
  # without escaping it, so a word like "a.b" also masked "axb" and a word
  # containing an unbalanced "(" raised RegexpError.
  #
  # @param input [String] text to mask; it is MUTATED (gsub!)
  # @return [String] the same +input+ object
  def self.clean(input)
    HarmoniousDictionary::Rseg.segment(input).each do |word|
      input.gsub!(word, clean_word_basic(word))
    end
    input
  end

  # Lists the sensitive words found in +input+.
  #
  # @param input [String] text to inspect
  # @return [Array] whatever Rseg.segment returns for +input+
  def self.harmonious_words(input)
    HarmoniousDictionary::Rseg.segment(input)
  end

  # Builds the replacement for a word: one "*" per character.
  #
  # @param word [String] word to mask (anything responding to #size works)
  # @return [String] a new string of word.size asterisks
  def self.clean_word_basic(word)
    "*" * word.size
  end

  # Returns the dictionary held by the first segmentation engine.
  # `engines` is a private method of Rseg, hence the `send`.
  def self.chinese_harmonious
    Rseg.instance.send(:engines).first.dictionary
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'sinatra/base'
|
2
|
+
|
3
|
+
module HarmoniousDictionary
  # Sinatra application that exposes the local segmenter over HTTP.
  #
  # Both routes read the text from the `input` form parameter and answer with
  # the detected words separated by single spaces.
  class App < Sinatra::Base
    set :root, File.dirname(__FILE__) + "/.."
    set :app_file, __FILE__

    # POST /segment -> detected words joined by a space.
    post '/segment' do
      @input = params[:input]
      @result = HarmoniousDictionary::Rseg.segment(@input).join(' ')
    end

    # POST /seg -> detected words joined by a space.
    #
    # This is the endpoint Rseg.remote_segment posts to; it splits the
    # response body on ' '. The original returned the raw Array, which
    # Sinatra sends as body chunks concatenated WITHOUT a separator, so the
    # client could not recover the individual words. Joining with a space
    # gives the client the format it expects.
    post '/seg' do
      @input = params[:input]
      @result = HarmoniousDictionary::Rseg.segment(@input).join(' ')
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module HarmoniousDictionary
  module RsegEngine
    # Dictionary engine: consumes one character at a time and follows it
    # through a nested lookup structure loaded from a marshalled file.
    # Nodes are indexed by character; a truthy `:end` entry on a node is
    # treated as "the characters consumed so far form a complete word".
    class Dict < Engine
      # @yieldreturn [String] path of the marshalled dictionary file; the
      #   block is called once, immediately.
      def initialize(&block)
        @dict_path = block.call
        @word = ''
        super
      end

      # Returns the dictionary root, loading it from disk on first use.
      #
      # The original read `@@root`, a class variable that is never assigned
      # anywhere in this class, so every call raised NameError. The loaded
      # structure is cached in the instance variable `@root`.
      def dictionary
        @root ||= load_dict(@dict_path)
      end

      # Feeds one character to the engine.
      #
      # @param char [Object] the (already filtered) character
      # @return [Array] `[true, nil]` when +char+ extends the current path;
      #   otherwise `[false, word]`, where +word+ is the text accumulated so
      #   far as a String (when the current node is marked `:end`, or exactly
      #   one character was accumulated) or as an Array of its characters.
      #   On a miss the engine returns to the root with an empty accumulator.
      def process(char)
        root = dictionary
        @node ||= root

        if @node[char]
          @word << char
          @node = @node[char]
          return [true, nil]
        end

        word = (@node[:end] || @word.length == 1) ? @word : @word.chars.to_a

        @node = root
        @word = ''
        [false, word]
      end

      private

      # Reads the marshalled dictionary at +path+.
      #
      # NOTE(review): Marshal.load must only be used on trusted files.
      # NOTE(review): on any StandardError the message is printed and the
      # whole process exits (behaviour kept from the original).
      def load_dict(path)
        begin
          File.open(path, "rb") { |io| Marshal.load(io) }
        rescue => e
          puts e
          exit
        end
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module HarmoniousDictionary
  module RsegEngine
    # Base class for segmentation engines. It only tracks a boolean
    # "running" flag that callers toggle with #stop and #run.
    class Engine
      # A new engine starts in the running state.
      def initialize
        @running = true
      end

      # Marks the engine as not running.
      # @return [false]
      def stop
        @running = false
      end

      # Marks the engine as running.
      # @return [true]
      def run
        @running = true
      end

      # @return [Boolean] the current value of the running flag
      def running?
        @running
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module HarmoniousDictionary
  module RsegEngine
    # The 52 ASCII letters, lower case first, each as a one-character String.
    LETTER_SYMBOLS = [*('a'..'z'), *('A'..'Z')]

    # Engine that accumulates runs of ASCII letters into a word.
    class English < Engine
      def initialize
        @word = ''
        super
      end

      # Feeds one character to the engine.
      #
      # @param char [Object] the (already filtered) character
      # @return [Array] `[true, nil]` while +char+ is an ASCII letter (it is
      #   appended to the pending word); otherwise `[false, word]` with the
      #   letters collected so far (possibly an empty String), after which
      #   the pending word starts over.
      def process(char)
        if LETTER_SYMBOLS.include?(char)
          @word << char
          return [true, nil]
        end

        finished = @word
        @word = ''
        [false, finished]
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module RsegFilter
  # Character filter that converts fullwidth digits, Latin letters and a few
  # punctuation marks to their ASCII counterparts.
  #
  # The table as written in the original mapped ASCII characters to
  # themselves ('1' => '1', 'a' => 'a', ...), which made the filter a no-op
  # for everything except the em dash and the middle dot. The keys are
  # rebuilt here from code points so they cannot be lost to text
  # normalisation again.
  class Fullwidth
    # Distance between an ASCII character (U+0021..U+007E) and its form in
    # the Unicode "Halfwidth and Fullwidth Forms" block (U+FF01..U+FF5E).
    FULLWIDTH_OFFSET = 0xFEE0

    # ASCII characters whose fullwidth form is normalised.
    ASCII_TARGETS = [*('0'..'9'), *('a'..'z'), *('A'..'Z'), '-', '+', ',', '/'].freeze

    # fullwidth (or look-alike) character => ASCII replacement
    FULLWIDTH_CHARS = ASCII_TARGETS.each_with_object({}) { |ascii, table|
      table[[ascii.ord + FULLWIDTH_OFFSET].pack('U')] = ascii
    }.merge(
      "\u2014" => '-', # em dash
      "\u00B7" => '.'  # middle dot
    ).freeze

    class << self
      # @param char [String] a single character
      # @return [String] the ASCII replacement when +char+ is in the table,
      #   otherwise +char+ itself
      def filter(char)
        FULLWIDTH_CHARS.fetch(char, char)
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module RsegFilter
  # Character filter that replaces separator characters (punctuation and
  # whitespace) with the marker :symbol and passes everything else through.
  class Symbol
    # CJK and typographic punctuation.
    cjk_punctuation = ['、', '。', '《', '》', '〈', '〉', '「', '」', '『', '』',
                       '〖', '〗', '【', '】', '〕', '‘', '’', '“', '”', '…']

    # ASCII punctuation.
    ascii_punctuation = ['`', '~', '!', '@', '#', '^', '&', '*', '(', ')',
                         '-', '+', '=', '[', ']', '{', '}', '\\', '|', ';',
                         ':', "'", '<', '>', ',', '/', '?']

    # Whitespace.
    whitespace = [' ', "\n", "\t", "\r"]

    # Same set of characters as before, each listed once.
    @@separators = cjk_punctuation + ascii_punctuation + whitespace

    # @param char [String] a single character
    # @return [::Symbol, String] :symbol when +char+ is a separator,
    #   otherwise +char+ unchanged
    def self.filter(char)
      return :symbol if @@separators.include?(char)

      char
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module HarmoniousDictionary
  # Class-level validation macro; the Railtie extends it onto models.
  module ModelAdditions
    # Adds an error to each listed attribute whose value is not blank and
    # is rejected by HarmoniousDictionary.clean?.
    #
    # @param attr_names [Array] attribute names, optionally followed by an
    #   options Hash; its entries are merged over the defaults and
    #   `:message` is used as the error text.
    def validate_harmonious_of(*attr_names)
      options = { message: '不能含有敏感词' }
      options.update(attr_names.pop) if attr_names.last.is_a?(Hash)

      validates_each attr_names do |record, attr, text|
        # Blank values are never checked.
        next if text.blank?
        next if HarmoniousDictionary.clean?(text)

        record.errors.add(attr, options[:message])
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module HarmoniousDictionary
  # Rails integration: registers the gem's rake tasks and hooks the
  # ModelAdditions macro into ActiveRecord when it loads.
  class Railtie < Rails::Railtie
    # Remote-server configuration, kept disabled as in the original:
    #
    #   config.harmonious_dictionary = ActiveSupport::OrderedOptions.new
    #   config.harmonious_dictionary.use_remote_server = false
    #   config.after_initialize do
    #     if config.harmonious_dictionary.use_remote_server
    #       Rseg.instance.load_remote_url_config
    #     end
    #   end

    # Makes the dictionary-generation rake task available to the host app.
    rake_tasks { load "tasks/generate_dictionary.rake" }

    # Extends the receiver of the :active_record load hook with
    # ModelAdditions (presumably ActiveRecord::Base — confirm against the
    # ActiveSupport.on_load contract).
    initializer 'HarmoniousDictionary.model_additions' do
      ActiveSupport.on_load(:active_record) { extend ModelAdditions }
    end
  end
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'singleton'
|
4
|
+
require 'net/http'
|
5
|
+
require 'yaml'
|
6
|
+
|
7
|
+
require File.join(File.dirname(__FILE__), 'engines/engine')
|
8
|
+
require File.join(File.dirname(__FILE__), 'engines/dict')
|
9
|
+
require File.join(File.dirname(__FILE__), 'engines/english')
|
10
|
+
|
11
|
+
require File.join(File.dirname(__FILE__), 'filters/fullwidth')
|
12
|
+
require File.join(File.dirname(__FILE__), 'filters/symbol')
|
13
|
+
require File.join(File.dirname(__FILE__), 'filters/conjunction')
|
14
|
+
|
15
|
+
module HarmoniousDictionary
  # Singleton segmenter. It pushes the input through the character filters
  # (Fullwidth, then Symbol) and feeds every filtered character to the
  # engines (Dict, then English), collecting the words they report.
  #
  # NOTE(review): the input and the collected words live on the shared
  # singleton instance, so concurrent calls to Rseg.segment would interfere
  # with each other — confirm callers are single-threaded.
  class Rseg
    include Singleton
    include RsegEngine
    include RsegFilter
    attr_writer :input

    class << self
      # Segments +input+ with the shared instance.
      #
      # @param input [String, nil] text to scan
      # @return [Array<String>] the words collected by #segment
      def segment(input)
        segmenter = HarmoniousDictionary::Rseg.instance
        segmenter.input = input
        segmenter.segment
      end

      # Forces creation of the singleton instance. +dict+ is accepted but
      # not used.
      #
      # @return [nil]
      def load(dict)
        HarmoniousDictionary::Rseg.instance
        nil
      end

      # Asks the remote server (POST http://<remote_url>/seg) to segment
      # +input+.
      #
      # @param input [String] text to scan
      # @return [Array<String>] the response body split on spaces when the
      #   server answers 200; otherwise — including when the request raises
      #   a StandardError — a one-element array holding an error message.
      #   NOTE(review): callers cannot tell that message apart from a
      #   detected word.
      def remote_segment(input)
        endpoint = "http://#{HarmoniousDictionary::Rseg.instance.remote_url}/seg"
        failure = ["Can't connect to #{endpoint}\nUse rseg_server to start it"]

        begin
          response = Net::HTTP.post_form(URI.parse(endpoint), :input => input)
          response.code == '200' ? response.body.split(' ') : failure
        rescue
          failure
        end
      end
    end

    def initialize
      @input = ''
      @words = []
      @chinese_dictionary_path = chinese_dictionary_path
      init_engines
      init_filters
    end

    # Host of the remote segmentation server, read once from
    # config/harmonious_dictionary/remote_server.yml and then cached.
    def remote_url
      @remote_url ||= load_remote_url_config
    end

    # Scans the current input character by character.
    #
    # A nil input is treated as an empty string (the original raised
    # NoMethodError on nil, which the HTTP routes can trigger when the
    # `input` parameter is missing).
    #
    # @return [Array<String>] the words collected during this scan
    def segment
      @words = []
      @input.to_s.chars.each do |origin|
        process(filter(origin), origin)
      end

      # A final separator flushes whatever the engines are still holding.
      process(:symbol, '')
      @words
    end

    private

    # Runs +char+ through every filter in order and returns the result.
    def filter(char)
      @filters.inject(char) { |current, klass| klass.filter(current) }
    end

    # Feeds one filtered character to every running engine.
    #
    # An engine that does not match is stopped. When no engine matched, all
    # engines are restarted; if the last engine consulted produced a
    # non-empty result, String results of two or more characters are
    # recorded and the same character is processed again.
    def process(char, origin)
      nomatch = true
      word = ''
      @english_dictionary ||= load_english_dictionary(english_yaml_path)

      engines.each do |engine|
        next unless engine.running?

        match, word = engine.process(char)
        if match
          nomatch = false
        else
          # An English run is only kept when it is listed in the English
          # dictionary.
          word = '' if engine.class == English && !@english_dictionary.include?(word)
          engine.stop
        end
      end

      return unless nomatch

      reset_engines

      # Anything that could not be segmented is treated as a normal word
      # and is not output.
      # @words << origin unless char == :symbol
      return if word == ''

      @words << word if word.is_a?(String) && word.size >= 2
      # We only need exact matches of dirty words; there is no need to
      # examine the following context.
      # reprocess(word) if word.is_a?(Array)
      # re-process current char
      process(char, origin)
    end

    # Replays the characters of +word+ (an Array; its last element is
    # removed and replayed last). Its only call site is commented out.
    def reprocess(word)
      last = word.pop

      word.each do |char|
        process(char, char)
      end

      process(:symbol, :symbol) # add the word
      process(last, last)       # continue analysing the word's last character
    end

    # Puts every engine back into the running state.
    def reset_engines
      engines.each do |engine|
        engine.run
      end
    end

    # Assigns the engine list only when none has been set yet.
    def engines=(engines)
      @engines ||= engines
    end

    def engines
      @engines
    end

    def init_filters
      @filters = [Fullwidth, Symbol]
    end

    # Builds the engine list once: the dictionary engine first, then the
    # English engine. Dict calls the block immediately to obtain the path of
    # its dictionary file.
    def init_engines
      @engines ||= [Dict.new { @chinese_dictionary_path }, English.new]
    end

    # Reads the `url` entry for the current Rails environment.
    def load_remote_url_config
      YAML.load(File.read(File.join(Rails.root, 'config','harmonious_dictionary','remote_server.yml')))[Rails.env]['url']
    end

    # Loads the English dictionary from the YAML file at +path+.
    #
    # NOTE(review): on any StandardError the message is printed and the
    # whole process exits (behaviour kept from the original).
    def load_english_dictionary(path)
      begin
        YAML.load(File.read(path))
      rescue => e
        puts e
        exit
      end
    end

    def english_yaml_path
      File.join(Rails.root, 'config','harmonious_dictionary','harmonious_english.yml')
    end

    def chinese_dictionary_path
      File.join(Rails.root, 'config','harmonious_dictionary','harmonious.hash')
    end
  end
end
|