harmonious_check 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +5 -0
- data/.rspec +1 -0
- data/.travis.yml +4 -0
- data/CHANGELOG +3 -0
- data/MIT-LICENSE +20 -0
- data/README.markdown +84 -0
- data/Rakefile +8 -0
- data/benchmark/benchmark.rb +43 -0
- data/benchmark/text_test_100.txt +4 -0
- data/benchmark/text_test_1000.txt +25 -0
- data/benchmark/text_test_10000.txt +219 -0
- data/bin/harmonious_rseg +11 -0
- data/bin/harmonious_server +63 -0
- data/harmonious_dictionary.gemspec +22 -0
- data/lib/generators/harmonious_dictionary/setup/setup_generator.rb +16 -0
- data/lib/generators/harmonious_dictionary/setup/templates/chinese_dictionary.txt +0 -0
- data/lib/generators/harmonious_dictionary/setup/templates/english_dictionary.txt +0 -0
- data/lib/generators/harmonious_dictionary/setup/templates/remote_server.yml +8 -0
- data/lib/harmonious_dictionary/app.rb +18 -0
- data/lib/harmonious_dictionary/engines/dict.rb +51 -0
- data/lib/harmonious_dictionary/engines/engine.rb +21 -0
- data/lib/harmonious_dictionary/engines/english.rb +29 -0
- data/lib/harmonious_dictionary/filters/conjunction.rb +13 -0
- data/lib/harmonious_dictionary/filters/fullwidth.rb +21 -0
- data/lib/harmonious_dictionary/filters/symbol.rb +18 -0
- data/lib/harmonious_dictionary/model_additions.rb +20 -0
- data/lib/harmonious_dictionary/railtie.rb +23 -0
- data/lib/harmonious_dictionary/rseg.rb +185 -0
- data/lib/harmonious_dictionary/version.rb +3 -0
- data/lib/harmonious_dictionary.rb +48 -0
- data/lib/tasks/generate_dictionary.rake +63 -0
- data/spec/harmonious_dictionary_spec.rb +61 -0
- data/spec/model_additions_spec.rb +57 -0
- data/spec/spec_helper.rb +21 -0
- metadata +109 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
|
3
|
+
require "harmonious_dictionary/version"
|
|
4
|
+
|
|
5
|
+
# Gem specification: package metadata, the list of shipped files and the
# development-time dependencies.
Gem::Specification.new do |spec|
  spec.name        = "harmonious_check"
  spec.version     = HarmoniousDictionary::VERSION
  spec.authors     = ["amelia9f"]
  spec.email       = ["amelia9fpyjo@gmx.com"]
  spec.homepage    = "https://github.com/amelia9f/harmonious_check"
  spec.summary     = %q{filter any words that need to be harmonized}
  spec.description = %q{和谐宝典用于检查输入是否包含中文或英文敏感词,并可替换为特殊字符。速度比常规的正则匹配要快10倍以上。生活在天朝,和谐宝典必须人手必备。}

  # The packaged file lists are taken from what git tracks, so the gem must be
  # built from inside a git checkout.
  tracked_files = `git ls-files`.split("\n")
  tracked_tests = `git ls-files -- {test,spec,features}/*`.split("\n")
  tracked_bins  = `git ls-files -- bin/*`.split("\n")

  spec.files         = tracked_files
  spec.test_files    = tracked_tests
  spec.executables   = tracked_bins.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Only needed to run the rake tasks and the spec suite.
  ["rake", "rspec"].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
require "rails/generators"
|
|
2
|
+
|
|
3
|
+
module HarmoniousDictionary
  module Generators
    # Rails generator that copies the dictionary source templates into the
    # host application's config/harmonious_dictionary directory.
    class SetupGenerator < ::Rails::Generators::Base
      desc "This generator creates necessary at config/harmonious_dictionary"
      source_root File.expand_path("../templates", __FILE__)

      # Copies both word-list templates (Chinese and English) into the
      # application's config directory.
      def generate_setup
        # Copying the remote server configuration is currently disabled:
        #copy_file "remote_server.yml", "config/harmonious_dictionary/remote_server.yml"
        ["chinese_dictionary.txt", "english_dictionary.txt"].each do |template|
          copy_file template, "config/harmonious_dictionary/#{template}"
        end
      end
    end
  end
end
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
require 'sinatra/base'
|
|
2
|
+
|
|
3
|
+
module HarmoniousDictionary
  # Small Sinatra application exposing the segmenter over HTTP.
  #
  # Both routes read the text to check from the +input+ form parameter and
  # answer with the detected words joined by single spaces.
  class App < Sinatra::Base
    set :root, File.dirname(__FILE__) + "/.."
    set :app_file, __FILE__

    # POST /segment -- detected words as one space-separated string.
    post '/segment' do
      # A missing parameter is treated as empty text instead of crashing.
      @input = params[:input].to_s
      # Rseg.segment takes (input, model). The model is passed explicitly as
      # nil (default dictionaries) and deliberately NOT taken from the request,
      # because it is interpolated into a dictionary file path.
      @result = HarmoniousDictionary::Rseg.segment(@input, nil).join(' ')
    end

    # POST /seg -- endpoint used by Rseg.remote_segment, which splits the
    # response body on spaces, so the words are joined with spaces here too.
    post '/seg' do
      @input = params[:input].to_s
      @result = HarmoniousDictionary::Rseg.segment(@input, nil).join(' ')
    end
  end
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
module HarmoniousDictionary
  module RsegEngine
    # Dictionary engine: walks a character trie (nested hashes, with an :end
    # key marking the end of a complete word) one character at a time.
    class Dict < Engine
      # The block is called once, immediately, and must return the path of the
      # marshalled trie file.
      def initialize(&block)
        @dict_path = block.call
        @word = ''
        super
      end

      # Returns the root of the trie, loading it from disk on first use.
      # (Previously this read an unset class variable and raised NameError.)
      def dictionary
        @root ||= load_dict(@dict_path)
      end

      # Feeds one character to the engine.
      #
      # Returns [match, word]:
      # * [true, nil] while +char+ extends the current path in the trie;
      # * [false, String] when the path ended on a complete word, or when the
      #   accumulated text is exactly one character long;
      # * [false, Array] (the accumulated characters) in every other case,
      #   including an empty array when nothing had been accumulated.
      def process(char)
        root = dictionary
        @node ||= root

        if @node[char]
          @word << char
          @node = @node[char]
          return [true, nil]
        end

        word = if @node[:end] || @word.length == 1
                 @word
               else
                 @word.chars.to_a
               end

        # Start over from the root for the next word.
        @node = root
        @word = ''
        [false, word]
      end

      private

      # Reads the marshalled trie from +path+.
      # NOTE: on any error the message is printed and the whole process exits;
      # this mirrors the original behaviour.
      def load_dict(path)
        File.open(path, "rb") { |io| Marshal.load(io) }
      rescue => e
        puts e
        exit
      end
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
module HarmoniousDictionary
  module RsegEngine
    # Base class for segmentation engines. It only tracks whether the engine
    # is currently switched on; subclasses add the actual character handling.
    class Engine
      # A new engine starts in the running state.
      def initialize
        run
      end

      # Switches the engine off. Returns false.
      def stop
        @running = false
      end

      # Switches the engine (back) on. Returns true.
      def run
        @running = true
      end

      # True while the engine is switched on.
      def running?
        @running
      end
    end
  end
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require 'set'
|
|
2
|
+
|
|
3
|
+
module HarmoniousDictionary
  module RsegEngine
    # Every ASCII letter, lower case first, then upper case.
    LETTER_SYMBOLS = Set.new([*('a'..'z'), *('A'..'Z')])

    # Engine that collects runs of consecutive ASCII letters into words.
    class English < Engine
      def initialize
        @word = ''
        super
      end

      # Feeds one character to the engine.
      #
      # Returns [true, nil] while +char+ is a letter (it is appended to the
      # current word). Otherwise returns [false, word] with the letters
      # collected so far (possibly an empty string) and starts a new word.
      def process(char)
        if LETTER_SYMBOLS.include?(char)
          @word << char
          return [true, nil]
        end

        finished = @word
        @word = ''
        [false, finished]
      end
    end
  end
end
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
require 'set'
|
|
4
|
+
|
|
5
|
+
module RsegFilter
  # Filter that replaces common single-character Chinese function words with
  # the marker :conjunction.
  class Conjunction
    @@conjunctions = %w[给 的 说 对 在 和 是 被 最 所 那 由 这 有 将 你 会 与 他 为 不 没 很 了 啊 哦 呵 把 去 从].to_set

    class << self
      # Returns :conjunction when +char+ is one of the listed words,
      # otherwise returns +char+ unchanged.
      def filter(char)
        return :conjunction if @@conjunctions.include?(char)

        char
      end
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
module RsegFilter
  # Filter that converts fullwidth digits, letters and a few punctuation marks
  # to their ASCII (halfwidth) counterparts.
  #
  # The lookup keys are built from code points rather than typed literally, so
  # they cannot silently degrade into plain ASCII (which would turn the table
  # into a no-op identity mapping).
  class Fullwidth
    # Fullwidth forms U+FF01..U+FF5E mirror ASCII 0x21..0x7E at this offset.
    FULLWIDTH_OFFSET = 0xFEE0

    table = {}
    [('0'..'9'), ('a'..'z'), ('A'..'Z'), ['-', '+', ',', '/']].each do |group|
      group.each do |half|
        table[(half.ord + FULLWIDTH_OFFSET).chr(Encoding::UTF_8)] = half
      end
    end
    table["\u2014"] = '-' # em dash
    table["\u00B7"] = '.' # middle dot
    @@fullwidth_chars = table

    class << self
      # Returns the ASCII replacement for +char+ when one is known, otherwise
      # returns +char+ unchanged.
      def filter(char)
        @@fullwidth_chars.fetch(char, char)
      end
    end
  end
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
require 'set'
|
|
4
|
+
|
|
5
|
+
module RsegFilter
  # Filter that replaces separator characters (punctuation and whitespace)
  # with the marker :symbol.
  #
  # Besides the ASCII and CJK punctuation listed below, the fullwidth form of
  # every ASCII separator is recognised as well. Those variants are derived
  # from code points so that they cannot be confused with (or collapse into)
  # their ASCII look-alikes.
  class Symbol
    ascii_separators = ['`', '~', '!', '@', '#', '^', '&', '*', '\\', '(', ')',
                        '=', '{', '}', '[', ']', '|', ';', ':', "'", '<', '>',
                        '?', '-', '/', '+', ',']
    # "\u3000" is the ideographic (fullwidth) space.
    whitespace = ["\n", "\t", "\r", ' ', "\u3000"]
    cjk_separators = ['、', '‘', '。', '》', '《', '“', '…', '’', '”', '〕',
                      '〈', '〉', '「', '」', '『', '』', '〖', '〗', '【', '】']
    # Fullwidth forms U+FF01..U+FF5E mirror ASCII 0x21..0x7E at offset 0xFEE0.
    fullwidth_separators = ascii_separators.map do |char|
      (char.ord + 0xFEE0).chr(Encoding::UTF_8)
    end

    @@separators = Set.new(ascii_separators + whitespace + cjk_separators + fullwidth_separators)

    # Returns :symbol when +char+ is a separator, otherwise returns +char+
    # unchanged.
    def self.filter(char)
      @@separators.include?(char) ? :symbol : char
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
module HarmoniousDictionary
  # Class-level validation macro for models.
  module ModelAdditions
    # Adds a validation that rejects attribute values containing sensitive
    # words.
    #
    # attr_names - one attribute name or an array of names. When given as an
    #              array, a trailing Hash is taken as configuration
    #              (e.g. { message: '...' }). The caller's array is not
    #              modified.
    # option     - optional Hash; when non-empty, its first value (converted
    #              to a String) is passed to HarmoniousDictionary.clean? as
    #              the dictionary model name.
    def validate_harmonious_of(attr_names, option = {})
      # Default error message: "must not contain sensitive words".
      configuration = {message:'不能含有敏感词'}
      # Accept a single name, and work on a copy so that the caller's array
      # keeps its trailing configuration hash.
      attr_names = Array(attr_names).dup
      configuration.update(attr_names.pop) if attr_names.last.is_a?(Hash)
      # Only pass a model name on when the caller supplied one.
      extra_args = option.size == 0 ? [] : [option.values[0].to_s]

      validates_each attr_names do |model, attribute, value|
        next if value.blank?
        next if HarmoniousDictionary.clean?(value, *extra_args)

        model.errors.add(attribute, configuration[:message])
      end
    end
  end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
module HarmoniousDictionary
  # Rails integration: registers the dictionary rake task and makes the
  # validation macro available on ActiveRecord models.
  class Railtie < Rails::Railtie
    # Remote-server support is currently disabled:
    # config.harmonious_dictionary = ActiveSupport::OrderedOptions.new
    # config.harmonious_dictionary.use_remote_server = false
    # config.after_initialize do
    #   if config.harmonious_dictionary.use_remote_server
    #     Rseg.instance.load_remote_url_config
    #   end
    # end

    rake_tasks { load "tasks/generate_dictionary.rake" }

    initializer 'HarmoniousDictionary.model_additions' do
      # Runs once ActiveRecord has been loaded.
      ActiveSupport.on_load(:active_record) { extend ModelAdditions }
    end
  end
end
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
require 'singleton'
|
|
4
|
+
require 'net/http'
|
|
5
|
+
require 'yaml'
|
|
6
|
+
|
|
7
|
+
require File.join(File.dirname(__FILE__), 'engines/engine')
|
|
8
|
+
require File.join(File.dirname(__FILE__), 'engines/dict')
|
|
9
|
+
require File.join(File.dirname(__FILE__), 'engines/english')
|
|
10
|
+
|
|
11
|
+
require File.join(File.dirname(__FILE__), 'filters/fullwidth')
|
|
12
|
+
require File.join(File.dirname(__FILE__), 'filters/symbol')
|
|
13
|
+
require File.join(File.dirname(__FILE__), 'filters/conjunction')
|
|
14
|
+
|
|
15
|
+
module HarmoniousDictionary
  # Singleton segmenter. It feeds the input one character at a time through
  # the filters and then through the engines (dictionary trie and English word
  # collector), and collects the sensitive words that were found.
  #
  # Dictionaries are looked up under config/harmonious_dictionary of the Rails
  # application; an optional +model+ name selects "<model>_"-prefixed files.
  # Engines and English word lists are cached per model, so switching models
  # between calls uses the matching dictionaries.
  class Rseg
    include Singleton
    include RsegEngine
    include RsegFilter
    attr_writer :input
    attr_accessor :model

    class << self
      # Segments +input+ with the dictionaries of +model+ (nil selects the
      # default dictionaries) and returns the array of detected words.
      def segment(input, model = nil)
        segmenter = instance
        segmenter.model = model
        segmenter.input = input
        segmenter.segment
      end

      # Instantiates the singleton. The +dict+ argument is ignored.
      def load(dict)
        instance
        nil
      end

      # Asks the remote segmentation server for the words found in +input+.
      # Returns the words, or a one-element array holding an error message
      # when the server cannot be reached or does not answer with 200.
      def remote_segment(input)
        response = Net::HTTP.post_form(URI.parse("http://#{instance.remote_url}/seg"), :input => input)
        response.code == '200' ? response.body.split(' ') : remote_failure
      rescue
        remote_failure
      end

      private

      # Error result shared by both failure paths of remote_segment.
      def remote_failure
        ["Can't connect to http://#{instance.remote_url}/seg\nUse rseg_server to start it"]
      end
    end

    def initialize
      @input = ''
      @words = []
      @engines_by_model = {}
      @english_dictionaries = {}
    end

    # Host (and port) of the remote server, read once from the configuration.
    def remote_url
      @remote_url ||= load_remote_url_config
    end

    # Runs the segmentation over the current input and returns the detected
    # words. A nil input is treated as empty text.
    def segment
      init_operate
      @words = []
      @input.to_s.chars.each do |origin|
        process(filter(origin), origin)
      end

      # A final separator flushes whatever the engines are still holding.
      process(:symbol, '')
      @words
    end

    private

    # Selects dictionaries, engines and filters for the current model.
    def init_operate
      @chinese_dictionary_path = chinese_dictionary_path
      @english_dictionary = (@english_dictionaries[model] ||= load_english_dictionary(english_yaml_path))
      init_engines
      init_filters
    end

    # Passes +char+ through every filter in order and returns the result.
    def filter(char)
      @filters.inject(char) { |result, klass| klass.filter(result) }
    end

    # Feeds one filtered character to every running engine.
    def process(char, origin)
      nomatch = true
      word = ''

      engines.each do |engine|
        next unless engine.running?
        match, word = engine.process(char)
        if match
          nomatch = false
        else
          # English words only count when they are in the English word list.
          word = '' if engine.class == English && !@english_dictionary.include?(word)
          engine.stop
        end
      end

      return unless nomatch

      reset_engines
      # Text that was not recognised is treated as ordinary and not emitted.
      # @words << origin unless char == :symbol
      return if word == ''

      @words << word if word.is_a?(String) && word.size >= 2
      # Only exact matches of sensitive words are needed; the characters of a
      # partial match are not re-examined.
      # reprocess(word) if word.is_a?(Array)
      # re-process current char
      process(char, origin)
    end

    # Re-feeds the characters of a partial match. Currently unused.
    def reprocess(word)
      last = word.pop

      word.each do |char|
        process(char, char)
      end

      process(:symbol, :symbol) # flush the word collected so far
      process(last, last)       # continue with the last character of the word
    end

    # Switches every engine back on.
    def reset_engines
      engines.each do |engine|
        engine.run
      end
    end

    def engines=(engines)
      @engines ||= engines
    end

    def engines
      @engines
    end

    def init_filters
      @filters = [Fullwidth, Symbol]
    end

    # Builds the engines for the current model once and reuses them later.
    def init_engines
      path = @chinese_dictionary_path
      @engines = (@engines_by_model[model] ||= [Dict.new { path }, English.new])
    end

    def load_remote_url_config
      YAML.load(File.read(File.join(Rails.root, 'config','harmonious_dictionary','remote_server.yml')))[Rails.env]['url']
    end

    # Reads the English word list from +path+; an empty file yields an empty
    # list.
    # NOTE: on any error the message is printed and the whole process exits;
    # this mirrors the original behaviour.
    def load_english_dictionary(path)
      YAML.load(File.read(path)) || []
    rescue => e
      puts e
      exit
    end

    def english_yaml_path
      dictionary_file('harmonious_english.yml')
    end

    def chinese_dictionary_path
      dictionary_file('harmonious.hash')
    end

    # Path of a dictionary file, prefixed with "<model>_" when a model is set.
    def dictionary_file(basename)
      filename = model.nil? ? basename : "#{model}_#{basename}"
      File.join(Rails.root, 'config','harmonious_dictionary', filename)
    end
  end
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
require "harmonious_dictionary/rseg"
|
|
4
|
+
require "harmonious_dictionary/version"
|
|
5
|
+
require "harmonious_dictionary/model_additions"
|
|
6
|
+
require "harmonious_dictionary/railtie" if defined? Rails
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Public entry points: check text for sensitive words and mask them.
module HarmoniousDictionary
  # True when +input+ contains no sensitive word. +model+ optionally selects
  # a model-specific dictionary.
  def self.clean?(input, model = nil)
    HarmoniousDictionary::Rseg.segment(input, model).empty?
  end

  # Same as clean?, but asks the remote segmentation server.
  def self.clean_by_remote?(input)
    HarmoniousDictionary::Rseg.remote_segment(input).empty?
  end

  # Masks every word reported by the remote server with asterisks and returns
  # the text.
  def self.clean_by_remote(input)
    words = HarmoniousDictionary::Rseg.remote_segment(input).map do |word|
      word.force_encoding('utf-8')
    end
    mask_words(input, words)
  end

  # Masks every sensitive word in +input+ with asterisks and returns the text.
  # An unfrozen +input+ is modified in place (as before); a frozen one is
  # copied first instead of raising.
  def self.clean(input, model = nil)
    mask_words(input, HarmoniousDictionary::Rseg.segment(input, model))
  end

  # Returns the sensitive words found in +input+.
  def self.harmonious_words(input, model = nil)
    HarmoniousDictionary::Rseg.segment(input, model)
  end

  # Returns a string of asterisks as long as +word+.
  def self.clean_word_basic(word)
    '*' * word.size
  end

  # Returns the dictionary of the first engine of the segmenter.
  def self.chinese_harmonious
    Rseg.instance.send(:engines).first.dictionary
  end

  # Replaces each of +words+ in +input+ with its mask. Words are matched
  # literally, so regexp metacharacters inside a word cannot over-match.
  def self.mask_words(input, words)
    text = input.frozen? ? input.dup : input
    words.each do |word|
      text.gsub!(word) { clean_word_basic(word) }
    end
    text
  end
  private_class_method :mask_words
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
require 'yaml'
|
|
2
|
+
|
|
3
|
+
namespace :harmonious_dictionary do
  desc "generate harmonious dictionary for use"
  # Compiles the plain-text word lists under config/harmonious_dictionary into
  # the files read at runtime: a marshalled character trie for the Chinese
  # words and a YAML array for the English words. Setting the environment
  # variable +model+ writes both files with a "<model>_" prefix.
  task :generate => :environment do
    chinese_dictionary_path = File.join(Rails.root, 'config','harmonious_dictionary','chinese_dictionary.txt')
    english_dictionary_path = File.join(Rails.root, 'config','harmonious_dictionary','english_dictionary.txt')
    model = ENV['model']

    puts "Processing chinese words..."
    tree = {}
    process(chinese_dictionary_path, tree)
    File.open(hash_path(model), "wb") {|io| Marshal.dump(tree, io)}
    puts 'Done'

    puts 'Processing english words...'
    english_dictionary = []
    process_english_words(english_dictionary_path,english_dictionary)
    # The model must be passed here as well, otherwise a model-specific run
    # overwrites the default English dictionary instead of writing its own.
    File.open(yaml_path(model), "wb") {|io| YAML::dump(english_dictionary, io)}
    puts 'Done'
  end
end
|
|
23
|
+
|
|
24
|
+
# Appends every non-blank line of the file at +path+ to +list+, without its
# line terminator ("\n", "\r\n" or "\r"), and returns +list+.
#
# A last line without a trailing newline is kept as a word (it used to be
# appended as nil).
def process_english_words(path,list)
  File.foreach(path) do |line|
    word = line.chomp
    list << word unless word.empty?
  end
  list
end
|
|
29
|
+
|
|
30
|
+
# Adds every word of the file at +path+ (one word per line) to +tree+, a trie
# of nested hashes keyed by character in which the key :end marks the end of
# a complete word. Returns +tree+.
#
# Carriage returns and newlines are ignored; blank lines are skipped (they
# used to raise NoMethodError).
def process(path, tree)
  File.foreach(path) do |line|
    chars = line.delete("\r\n").chars.to_a
    next if chars.empty?

    # Walk down the trie, creating missing nodes on the way.
    leaf = chars.inject(tree) { |node, c| node[c] ||= {} }
    leaf[:end] = true
  end
  tree
end
|
|
48
|
+
|
|
49
|
+
# Path of the marshalled Chinese dictionary under config/harmonious_dictionary;
# the file name is prefixed with "<model>_" when +model+ is given.
def hash_path(model = nil)
  filename = model ? "#{model}_harmonious.hash" : 'harmonious.hash'
  File.join(Rails.root, 'config','harmonious_dictionary', filename)
end
|
|
56
|
+
|
|
57
|
+
# Path of the English dictionary YAML file under config/harmonious_dictionary;
# the file name is prefixed with "<model>_" when +model+ is given.
def yaml_path(model = nil)
  filename = model ? "#{model}_harmonious_english.yml" : 'harmonious_english.yml'
  File.join(Rails.root, 'config','harmonious_dictionary', filename)
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
|
|
5
|
+
# Specs for the local (in-process) API, run with and without a model name.
describe HarmoniousDictionary do

  describe 'local' do
    describe 'segment' do
      it('should return harmonious word for sentence') {
        HarmoniousDictionary.harmonious_words('戴秉国在中国').should == ['戴秉国']
      }

      it('should return harmonious word for sentence under have model param') {
        HarmoniousDictionary.harmonious_words('戴秉国在中国', 'user').should == ['戴秉国']
      }

      it('should return english,url and chinese words') {
        HarmoniousDictionary.harmonious_words('戴秉国 in china,watch cctv.com.let fuck it','user').should == ['戴秉国','fuck']
      }

      it('should return english words under have model param') {
        HarmoniousDictionary.harmonious_words('Gruepin','user').should == ['Gruepin']
      }
    end

    # Detection
    it('should find harmonious chinese words') {
      HarmoniousDictionary.clean?('李源潮在中国').should == false
    }

    it('should find harmonious chinese words under have model param') {
      HarmoniousDictionary.clean?('李源潮在中国', 'user').should == false
    }

    it('should pass good words') {
      HarmoniousDictionary.clean?('过去').should == true
    }

    it('should pass good words under have model param') {
      HarmoniousDictionary.clean?('过去', 'user').should == true
    }

    # Masking
    it('should clean sentence by replace harmonious words by *') {
      HarmoniousDictionary.clean('戴秉国在中国').should == '***在中国'
    }

    it('should clean sentence by replace harmonious words by * under model param') {
      HarmoniousDictionary.clean('戴秉国在中国', 'user').should == '***在中国'
    }

    it('should replace harmonious with *') {
      HarmoniousDictionary.clean_word_basic('大米').should == '**'
    }
  end

  # The remote-server spec is disabled:
  # describe 'use remote' do
  #   it 'should use remote server for segment' do
  #     HarmoniousDictionary.clean_by_remote('戴秉国在中国').should == '***在中国'
  #   end
  # end
end
|