harmonious_dictionary 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/.rspec +1 -0
- data/CHANGELOG +3 -0
- data/MIT-LICENSE +20 -0
- data/README.markdown +53 -0
- data/Rakefile +8 -0
- data/benchmark/benchmark.rb +43 -0
- data/benchmark/text_test_100.txt +4 -0
- data/benchmark/text_test_1000.txt +25 -0
- data/benchmark/text_test_10000.txt +219 -0
- data/bin/harmonious_rseg +11 -0
- data/bin/harmonious_server +63 -0
- data/harmonious_dictionary.gemspec +20 -0
- data/lib/generators/harmonious_dictionary/setup/setup_generator.rb +16 -0
- data/lib/generators/harmonious_dictionary/setup/templates/chinese_dictionary.txt +0 -0
- data/lib/generators/harmonious_dictionary/setup/templates/english_dictionary.txt +0 -0
- data/lib/generators/harmonious_dictionary/setup/templates/remote_server.yml +8 -0
- data/lib/harmonious_dictionary.rb +48 -0
- data/lib/harmonious_dictionary/app.rb +18 -0
- data/lib/harmonious_dictionary/engines/dict.rb +51 -0
- data/lib/harmonious_dictionary/engines/engine.rb +21 -0
- data/lib/harmonious_dictionary/engines/english.rb +27 -0
- data/lib/harmonious_dictionary/filters/conjunction.rb +11 -0
- data/lib/harmonious_dictionary/filters/fullwidth.rb +21 -0
- data/lib/harmonious_dictionary/filters/symbol.rb +16 -0
- data/lib/harmonious_dictionary/model_additions.rb +15 -0
- data/lib/harmonious_dictionary/railtie.rb +23 -0
- data/lib/harmonious_dictionary/rseg.rb +170 -0
- data/lib/harmonious_dictionary/version.rb +3 -0
- data/lib/tasks/generate_dictionary.rake +55 -0
- data/spec/harmonious_dictionary_spec.rb +40 -0
- data/spec/model_additions_spec.rb +57 -0
- data/spec/spec_helper.rb +21 -0
- metadata +99 -0
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "harmonious_dictionary/version"
|
4
|
+
|
5
|
+
# Gem specification for harmonious_dictionary: package metadata, the file
# lists (read from the git index, so building needs a git checkout) and the
# development-time dependencies.
Gem::Specification.new do |spec|
  # Runs `git ls-files` with optional extra arguments; one path per element.
  tracked_files = lambda { |args = ""| `git ls-files#{args}`.split("\n") }

  spec.name        = "harmonious_dictionary"
  spec.version     = HarmoniousDictionary::VERSION
  spec.authors     = ["Stephen Kong"]
  spec.email       = ["wear63659220@gmail.com"]
  spec.homepage    = "https://github.com/wear/harmonious_dictionary"
  spec.summary     = %q{filter any words that need to be harmonized}
  spec.description = %q{和谐宝典用于检查输入是否包含中文或英文敏感词,并可替换为特殊字符。速度比常规的正则匹配要快10倍以上。生活在天朝,和谐宝典必须人手必备。}

  spec.files         = tracked_files.call
  spec.test_files    = tracked_files.call(" -- {test,spec,features}/*")
  # Executables are registered by bare file name, not by their bin/ path.
  spec.executables   = tracked_files.call(" -- bin/*").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "rspec"
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require "rails/generators"
|
2
|
+
|
3
|
+
module HarmoniousDictionary
  module Generators
    # Rails generator that copies the dictionary word-list templates shipped
    # next to this file (in ./templates) into the host application's
    # config/harmonious_dictionary directory.
    class SetupGenerator < ::Rails::Generators::Base
      desc "This generator creates the necessary dictionary files at config/harmonious_dictionary"
      source_root File.expand_path("../templates", __FILE__)

      # Copies every dictionary template to config/harmonious_dictionary/.
      def generate_setup
        # Installing the remote server config is currently disabled:
        # copy_file "remote_server.yml", "config/harmonious_dictionary/remote_server.yml"
        %w[chinese_dictionary.txt english_dictionary.txt].each do |template|
          copy_file template, "config/harmonious_dictionary/#{template}"
        end
      end
    end
  end
end
|
File without changes
|
File without changes
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require "harmonious_dictionary/rseg"
|
4
|
+
require "harmonious_dictionary/version"
|
5
|
+
require "harmonious_dictionary/model_additions"
|
6
|
+
require "harmonious_dictionary/railtie" if defined? Rails
|
7
|
+
|
8
|
+
|
9
|
+
# Public entry points for detecting and masking sensitive ("harmonious")
# words. Detection is delegated to HarmoniousDictionary::Rseg, either in
# process (segment) or through the HTTP segment server (remote_segment).
module HarmoniousDictionary
  # @param input [String] text to check
  # @return [Boolean] true when the local segmenter reports no sensitive word
  def self.clean?(input)
    HarmoniousDictionary::Rseg.segment(input).empty?
  end

  # Same check as clean?, but asks the remote segment server.
  #
  # Rseg.remote_segment answers with a one-element error message list when the
  # server cannot be reached, so this returns false in that case as well.
  #
  # @param input [String] text to check
  # @return [Boolean] true when the remote segmenter returns an empty list
  def self.clean_by_remote?(input)
    HarmoniousDictionary::Rseg.remote_segment(input).empty?
  end

  # Masks every word reported by the remote segment server.
  #
  # @param input [String] text to mask; modified in place
  # @return [String] the same +input+ object after masking
  def self.clean_by_remote(input)
    # The HTTP body carries no encoding information, so tag each word as UTF-8
    # before matching it against the input.
    words = HarmoniousDictionary::Rseg.remote_segment(input).map do |word|
      word.force_encoding('utf-8')
    end
    mask_words(input, words)
  end

  # Masks every word reported by the local segmenter.
  #
  # @param input [String] text to mask; modified in place
  # @return [String] the same +input+ object after masking
  def self.clean(input)
    mask_words(input, HarmoniousDictionary::Rseg.segment(input))
  end

  # @param input [String] text to analyse
  # @return [Array<String>] the sensitive words found by the local segmenter
  def self.harmonious_words(input)
    HarmoniousDictionary::Rseg.segment(input)
  end

  # @param word [String] word to hide
  # @return [String] one "*" per character of +word+
  def self.clean_word_basic(word)
    '*' * word.size
  end

  # @return [Object] whatever the first segmentation engine's #dictionary returns
  def self.chinese_harmonious
    Rseg.instance.send(:engines).first.dictionary
  end

  # Replaces each occurrence of every word in +words+ with its mask.
  #
  # Words are matched literally (String pattern), never as regular
  # expressions: a detected word such as "a.b" must not also mask "axb", and
  # one such as "(x" must not raise RegexpError.
  def self.mask_words(input, words)
    words.each { |word| input.gsub!(word, clean_word_basic(word)) }
    input
  end
  private_class_method :mask_words
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'sinatra/base'
|
2
|
+
|
3
|
+
module HarmoniousDictionary
  # Sinatra application that exposes the local segmenter over HTTP.
  #
  # Both routes read the text from the +input+ form parameter and answer with
  # the detected words joined by single spaces, which is the format
  # Rseg.remote_segment parses with split(' ').
  class App < Sinatra::Base
    set :root, File.dirname(__FILE__) + "/.."
    set :app_file, __FILE__

    helpers do
      # Segments +input+ and renders the words as one space-separated String.
      #
      # A String must be returned: when a route returns an Array of strings,
      # Sinatra uses it as the body chunks and the words reach the client
      # concatenated without any separator.
      def segment_words(input)
        @input = input.to_s # a missing parameter is treated as empty text
        @result = HarmoniousDictionary::Rseg.segment(@input).join(' ')
      end
    end

    post '/segment' do
      segment_words(params[:input])
    end

    # Endpoint used by Rseg.remote_segment.
    post '/seg' do
      segment_words(params[:input])
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module HarmoniousDictionary
  module RsegEngine
    # Engine that walks a dictionary tree one character at a time.
    #
    # The tree is read with Marshal from the path returned by the block given
    # to +new+. Nodes are indexed with [] by character; a node with a truthy
    # [:end] entry marks the end of a dictionary word.
    class Dict < Engine
      # @yieldreturn [String] path of the marshalled dictionary file
      def initialize(&block)
        @dict_path = block.call
        @word = ''
        super
      end

      # @return [Object] root node of the dictionary tree, loaded on first use
      def dictionary
        @root ||= load_dict(@dict_path)
      end

      # Feeds one character to the engine.
      #
      # @param char [String, Symbol] the filtered character
      # @return [Array] [true, nil] while +char+ extends the current path;
      #   otherwise [false, word], where word is the collected String when it
      #   is a complete dictionary word or a single character, and the Array
      #   of collected characters when the path stopped in mid-word
      def process(char)
        root = dictionary
        @node ||= root

        if @node[char]
          @word << char
          @node = @node[char]
          return [true, nil]
        end

        collected = @word
        word = if @node[:end] || collected.length == 1
                 collected
               else
                 collected.chars.to_a
               end

        # Start over from the root for the next word.
        @node = root
        @word = ''
        [false, word]
      end

      private

      # Reads the marshalled tree stored at +path+.
      #
      # NOTE(review): Marshal.load must only ever see trusted files.
      # NOTE(review): a load failure prints the error and terminates the
      # whole process via exit; kept as is because callers may rely on it.
      def load_dict(path)
        File.open(path, "rb") { |io| Marshal.load(io) }
      rescue => e
        puts e
        exit
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module HarmoniousDictionary
  module RsegEngine
    # Base class of the segmentation engines. It only keeps a running/stopped
    # flag; engines start out running.
    class Engine
      def initialize
        @running = true
      end

      # @return [Boolean] whether the engine is currently switched on
      def running?
        @running
      end

      # Switches the engine on.
      # @return [true]
      def run
        @running = true
      end

      # Switches the engine off.
      # @return [false]
      def stop
        @running = false
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module HarmoniousDictionary
  module RsegEngine
    # The 52 unaccented Latin letters, lower case first.
    LETTER_SYMBOLS = [*'a'..'z', *'A'..'Z']

    # Engine that collects runs of Latin letters into words.
    class English < Engine
      def initialize
        @word = ''
        super
      end

      # Feeds one character to the engine.
      #
      # @param char [String, Symbol] the filtered character
      # @return [Array] [true, nil] while +char+ is a letter; otherwise
      #   [false, word] with the letters collected so far (possibly "")
      def process(char)
        if LETTER_SYMBOLS.include?(char)
          @word << char
          return [true, nil]
        end

        # A non-letter ends the run: hand back the word and start a new one.
        finished, @word = @word, ''
        [false, finished]
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module RsegFilter
  # Replaces fullwidth digits, Latin letters and a few punctuation marks with
  # their ASCII counterparts so that later stages only see one form.
  class Fullwidth
    # A fullwidth form sits exactly 0xFEE0 code points above its ASCII twin.
    FULLWIDTH_OFFSET = 0xFEE0

    # ASCII characters whose fullwidth form is folded back to ASCII.
    NARROW_CHARS = [*'0'..'9', *'a'..'z', *'A'..'Z', '-', '+', ',', '/'].freeze

    # Lookup table: character to replace => ASCII replacement.
    #
    # The fullwidth keys are generated from code points rather than written as
    # literals, so tools that normalize text cannot silently turn them into
    # ASCII (which would make the table map every character to itself).
    # NOTE(review): confirm this set matches the intended fullwidth coverage.
    REPLACEMENTS = NARROW_CHARS.each_with_object({}) { |narrow, table|
      table[[narrow.ord + FULLWIDTH_OFFSET].pack('U')] = narrow
    }.merge('—' => '-', '·' => '.').freeze

    class << self
      # @param char [String] a single character
      # @return [String] the ASCII replacement, or +char+ itself when it has none
      def filter(char)
        REPLACEMENTS.fetch(char, char)
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module RsegFilter
  # Collapses every separator character (punctuation, brackets, whitespace)
  # into the marker :symbol and passes all other characters through.
  #
  # Note that inside RsegFilter this class shadows Ruby's core ::Symbol.
  class Symbol
    # ASCII punctuation and whitespace treated as separators.
    ASCII_SEPARATORS = ['`', '~', '!', '@', '#', '^', '&', '*', '\\', '(', ')',
                        '=', '{', '}', '[', ']', '|', ';', ':', "'", '<', '>',
                        '?', "\n", "\t", "\r", ' ', '-', '/', '+', ','].freeze

    # CJK punctuation and quotation marks treated as separators.
    CJK_SEPARATORS = ['、', '‘', '。', '》', '《', '“', '…', '’', '”', '〕', '〈',
                      '〉', '「', '」', '『', '』', '〖', '〗', '【', '】'].freeze

    # Fullwidth forms (ASCII code point + 0xFEE0) plus the ideographic space.
    # They are generated from code points so that text normalization cannot
    # collapse them into duplicates of the ASCII entries.
    # NOTE(review): this set was chosen from the ASCII entries that used to be
    # listed twice -- confirm it matches the intended fullwidth separators.
    FULLWIDTH_SEPARATORS = (
      '`[]=;|?:{})(*#!~<>'.each_char.map { |c| [c.ord + 0xFEE0].pack('U') } + ["\u3000"]
    ).freeze

    # Hash used as a set so that each lookup is a single hash probe.
    SEPARATORS = (ASCII_SEPARATORS + CJK_SEPARATORS + FULLWIDTH_SEPARATORS)
                 .each_with_object({}) { |char, set| set[char] = true }
                 .freeze

    # @param char [String] a single character
    # @return [Symbol, String] :symbol for a separator, otherwise +char+ itself
    def self.filter(char)
      SEPARATORS.key?(char) ? :symbol : char
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module HarmoniousDictionary
  # Class-level validation macro; the extending class must provide
  # +validates_each+.
  module ModelAdditions
    # Adds an error to each listed attribute whose non-blank value is rejected
    # by HarmoniousDictionary.clean?.
    #
    # @param attr_names [Array] attribute names, optionally followed by an
    #   options Hash; its :message entry overrides the default error message
    # @return whatever +validates_each+ returns
    def validate_harmonious_of(*attr_names)
      overrides = attr_names.last.is_a?(Hash) ? attr_names.pop : {}
      configuration = { message: '不能含有敏感词' }.merge(overrides)

      validates_each attr_names do |model, attribute, value|
        # Blank values are never checked against the dictionary.
        next if value.blank?
        next if HarmoniousDictionary.clean?(value)

        model.errors.add(attribute, configuration[:message])
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module HarmoniousDictionary
  # Rails integration: registers the gem's rake task file and extends
  # ActiveRecord with the validation macro from ModelAdditions.
  class Railtie < Rails::Railtie
    # Remote-server configuration is currently disabled:
    #
    #   config.harmonious_dictionary = ActiveSupport::OrderedOptions.new
    #   config.harmonious_dictionary.use_remote_server = false
    #   config.after_initialize do
    #     if config.harmonious_dictionary.use_remote_server
    #       Rseg.instance.load_remote_url_config
    #     end
    #   end

    rake_tasks { load "tasks/generate_dictionary.rake" }

    initializer 'HarmoniousDictionary.model_additions' do
      # Runs once ActiveRecord has been loaded.
      ActiveSupport.on_load(:active_record) { extend ModelAdditions }
    end
  end
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'singleton'
|
4
|
+
require 'net/http'
|
5
|
+
require 'yaml'
|
6
|
+
|
7
|
+
require File.join(File.dirname(__FILE__), 'engines/engine')
|
8
|
+
require File.join(File.dirname(__FILE__), 'engines/dict')
|
9
|
+
require File.join(File.dirname(__FILE__), 'engines/english')
|
10
|
+
|
11
|
+
require File.join(File.dirname(__FILE__), 'filters/fullwidth')
|
12
|
+
require File.join(File.dirname(__FILE__), 'filters/symbol')
|
13
|
+
require File.join(File.dirname(__FILE__), 'filters/conjunction')
|
14
|
+
|
15
|
+
module HarmoniousDictionary
  # Singleton segmenter that scans text character by character and collects
  # the words recognised by its engines (the dictionary tree engine and the
  # English letter engine).
  #
  # All state (@input, @words and the engines' positions) lives in the single
  # shared instance.
  # NOTE(review): this looks unsafe for concurrent callers -- confirm before
  # using it from several threads.
  class Rseg
    include Singleton
    include RsegEngine
    include RsegFilter
    attr_writer :input

    class << self
      # Segments +input+ with the shared instance.
      #
      # @param input [String] text to analyse
      # @return [Array<String>] the words that were found
      def segment(input)
        instance.input = input
        instance.segment
      end

      # Forces creation of the shared instance. The +dict+ argument is
      # accepted but not used.
      #
      # @return [nil]
      def load(dict)
        instance
        nil
      end

      # Asks the remote segment server for the words found in +input+.
      #
      # @param input [String] text to analyse
      # @return [Array<String>] the words from the response body; or, when
      #   the server does not answer with 200 or the request raises, a
      #   one-element list holding an error message
      def remote_segment(input)
        endpoint = URI.parse("http://#{instance.remote_url}/seg")
        response = Net::HTTP.post_form(endpoint, :input => input)
        response.code == '200' ? response.body.split(' ') : [unreachable_message]
      rescue
        [unreachable_message]
      end

      private

      # Error text returned in place of a result when the server is unusable.
      # It names harmonious_server, the executable this gem ships in bin/.
      def unreachable_message
        "Can't connect to http://#{instance.remote_url}/seg\nUse harmonious_server to start it"
      end
    end

    def initialize
      @input = ''
      @words = []
      @chinese_dictionary_path = chinese_dictionary_path
      init_engines
      init_filters
    end

    # @return [String] remote server address from remote_server.yml (memoized)
    def remote_url
      @remote_url ||= load_remote_url_config
    end

    # Runs the engines over the current input.
    #
    # @return [Array<String>] the words that were found
    def segment
      @words = []
      # to_s: a nil input is treated as empty text instead of raising.
      @input.to_s.each_char do |origin|
        process(filter(origin), origin)
      end

      # A trailing separator flushes whatever word is still pending.
      process(:symbol, '')
      @words
    end

    private

    # Passes +char+ through every filter in order.
    def filter(char)
      @filters.inject(char) { |current, klass| klass.filter(current) }
    end

    # Feeds one filtered character to every running engine and records a
    # finished word once no engine can continue.
    def process(char, origin)
      nomatch = true
      word = ''
      @english_dictionary ||= load_english_dictionary(english_yaml_path)

      engines.each do |engine|
        next unless engine.running?

        match, word = engine.process(char)
        if match
          nomatch = false
        else
          # English words only count when they are listed in the dictionary.
          word = '' if engine.instance_of?(English) && !@english_dictionary.include?(word)
          engine.stop
        end
      end

      return unless nomatch

      reset_engines
      # Nothing was segmented: treat it as a normal word and do not output it.
      return if word == ''

      @words << word if word.is_a?(String) && word.size >= 2
      # Only exact matches of dirty words are needed, so the characters of a
      # partial match (an Array) are not looked at again:
      # reprocess(word) if word.is_a?(Array)
      # Re-process the current character with freshly reset engines.
      process(char, origin)
    end

    # Re-feeds the characters of a partial match; currently not called.
    def reprocess(word)
      last = word.pop

      word.each do |char|
        process(char, char)
      end

      process(:symbol, :symbol) # flush the collected word
      process(last, last)       # continue with the word's last character
    end

    # Switches every engine back on.
    def reset_engines
      engines.each do |engine|
        engine.run
      end
    end

    # Only assigns when no engines have been set yet.
    def engines=(engines)
      @engines ||= engines
    end

    def engines
      @engines
    end

    # Fullwidth and Symbol resolve to the RsegFilter classes here (RsegFilter
    # is included above), not to Ruby's core ::Symbol.
    def init_filters
      @filters = [Fullwidth, Symbol]
    end

    # Builds the engine list once: the dictionary engine first, then English.
    def init_engines
      @engines ||= [
        # The block only hands the path to Dict; it sets nothing on Rseg.
        Dict.new { @chinese_dictionary_path },
        English.new
      ]
    end

    # Reads the server address for the current Rails environment.
    def load_remote_url_config
      config_path = File.join(Rails.root, 'config', 'harmonious_dictionary', 'remote_server.yml')
      YAML.load(File.read(config_path))[Rails.env]['url']
    end

    # Reads the English word list from the YAML file at +path+.
    #
    # An empty YAML document parses to false; it is treated as an empty list.
    # NOTE(review): a load failure prints the error and terminates the whole
    # process via exit; kept as is because callers may rely on it.
    def load_english_dictionary(path)
      YAML.load(File.read(path)) || []
    rescue => e
      puts e
      exit
    end

    def english_yaml_path
      File.join(Rails.root, 'config', 'harmonious_dictionary', 'harmonious_english.yml')
    end

    def chinese_dictionary_path
      File.join(Rails.root, 'config', 'harmonious_dictionary', 'harmonious.hash')
    end
  end
end
|