filter_word 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +5 -0
- data/.rspec +1 -0
- data/.travis.yml +4 -0
- data/CHANGELOG +7 -0
- data/MIT-LICENSE +20 -0
- data/README.markdown +73 -0
- data/Rakefile +7 -0
- data/benchmark/benchmark.rb +39 -0
- data/benchmark/text_test_100.txt +4 -0
- data/benchmark/text_test_1000.txt +25 -0
- data/benchmark/text_test_10000.txt +219 -0
- data/bin/harmonious_rseg +11 -0
- data/filter_word.gemspec +22 -0
- data/lib/filter_word.rb +34 -0
- data/lib/filter_word/app.rb +18 -0
- data/lib/filter_word/engines/dict.rb +51 -0
- data/lib/filter_word/engines/engine.rb +21 -0
- data/lib/filter_word/engines/english.rb +29 -0
- data/lib/filter_word/filters/conjunction.rb +13 -0
- data/lib/filter_word/filters/fullwidth.rb +21 -0
- data/lib/filter_word/filters/symbol.rb +18 -0
- data/lib/filter_word/model_additions.rb +20 -0
- data/lib/filter_word/railtie.rb +15 -0
- data/lib/filter_word/rseg.rb +150 -0
- data/lib/filter_word/version.rb +3 -0
- data/lib/generators/filter_word/setup/setup_generator.rb +15 -0
- data/lib/generators/filter_word/setup/templates/chinese_dictionary.txt +1511 -0
- data/lib/generators/filter_word/setup/templates/english_dictionary.txt +66 -0
- data/lib/tasks/generate_dictionary.rake +67 -0
- data/spec/filter_word_spec.rb +56 -0
- data/spec/model_additions_spec.rb +42 -0
- data/spec/spec_helper.rb +21 -0
- metadata +107 -0
@@ -0,0 +1,66 @@
|
|
1
|
+
6-4tianwang
|
2
|
+
89-64cdjp
|
3
|
+
ADMIN
|
4
|
+
Administrator
|
5
|
+
asshole
|
6
|
+
BLOWJOB
|
7
|
+
chinaliberal
|
8
|
+
chinamz
|
9
|
+
chinesenewsnet
|
10
|
+
Clockgemstone
|
11
|
+
creaders
|
12
|
+
Crestbone
|
13
|
+
dajiyuan
|
14
|
+
dfdz
|
15
|
+
DICK
|
16
|
+
falun
|
17
|
+
falundafa
|
18
|
+
Feelmistone
|
19
|
+
freechina
|
20
|
+
freenet
|
21
|
+
fuck
|
22
|
+
gcd
|
23
|
+
Gruepin
|
24
|
+
Guichuideng
|
25
|
+
HACKING
|
26
|
+
hongzhi
|
27
|
+
hrichina
|
28
|
+
HUANET
|
29
|
+
hypermart.net
|
30
|
+
incest
|
31
|
+
jiangdongriji
|
32
|
+
jiaochuang
|
33
|
+
jiaochun
|
34
|
+
KEFU
|
35
|
+
KISSMYASS
|
36
|
+
lihongzhi
|
37
|
+
minghui
|
38
|
+
minghuinews
|
39
|
+
nacb
|
40
|
+
Neckromancer
|
41
|
+
NMIS
|
42
|
+
PAPER64
|
43
|
+
penis
|
44
|
+
qiangjian
|
45
|
+
renminbao
|
46
|
+
renmingbao
|
47
|
+
SHIT
|
48
|
+
SUCKPENIS
|
49
|
+
taip
|
50
|
+
tibetalk
|
51
|
+
triangle
|
52
|
+
triangleboy
|
53
|
+
Tringel
|
54
|
+
UltraSurf
|
55
|
+
ustibet
|
56
|
+
voachinese
|
57
|
+
wangce
|
58
|
+
WEBZEN
|
59
|
+
wstaiji
|
60
|
+
xinsheng
|
61
|
+
YUMING
|
62
|
+
zangdu
|
63
|
+
ZHENGJIAN
|
64
|
+
ZHENGJIANWANG
|
65
|
+
ZHENSHANREN
|
66
|
+
zhuanfalun
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
|
3
|
+
namespace :filter_word do
  desc "generate harmonious dictionary for use"
  task :generate => :environment do
    puts "Processing chinese words..."

    # Optional model name read from the `model` environment variable. When it
    # is set, the dictionary file names are prefixed with "<model>_".
    model = ENV['model']
    prefix = model.nil? ? '' : "#{model}_"

    chinese_dictionary_path = File.join(Rails.root, 'config', 'filter_word', "#{prefix}chinese_dictionary.txt")
    english_dictionary_path = File.join(Rails.root, 'config', 'filter_word', "#{prefix}english_dictionary.txt")

    # Chinese words: build the nested-hash tree and serialize it with Marshal.
    tree = {}
    process(chinese_dictionary_path, tree)
    File.open(hash_path(model), "wb") { |io| Marshal.dump(tree, io) }
    puts 'chinese_dictionary hash Done'

    # English words: collect them into a flat list and dump it as YAML.
    puts 'Processing english words...'
    english_dictionary_list = []
    process_english_words(english_dictionary_path, english_dictionary_list)
    File.open(yaml_path(model), "wb") { |io| YAML.dump(english_dictionary_list, io) }
    puts 'english_dictionary yaml Done'
  end
end
|
27
|
+
|
28
|
+
# Reads the English dictionary at +path+ and appends each word to +list+.
#
# Line terminators ("\n" or "\r\n") are removed with String#chomp, which
# always returns a String. The earlier gsub!("\n", '') returned nil whenever a
# line contained no newline (typically the last line of the file), which
# pushed nil into the list, and it left a trailing "\r" on CRLF files.
#
# path - String path of a text file holding one word per line.
# list - Array that receives the words; it is mutated in place.
def process_english_words(path,list)
  File.open(path, 'r') do |file|
    file.each_line do |line|
      word = line.chomp
      # A blank line carries no word to filter, so it is not added.
      list << word unless word.empty?
    end
  end
end
|
33
|
+
|
34
|
+
# Inserts every line of the dictionary file at +path+ into +tree+, a nested
# Hash keyed by single characters. The node reached after the last character
# of a line gets the key :end set to true.
#
# "\r" and "\n" characters are ignored. A line that contains nothing else
# (for example a blank line) is skipped; previously such a line left the
# current node nil and `node[:end] = true` raised NoMethodError.
#
# path - String path of a text file holding one word per line.
# tree - Hash that receives the words; it is mutated in place.
def process(path, tree)
  File.open(path, 'r') do |file|
    file.each_line do |line|
      # Start at the root and descend one level per character, creating
      # missing child nodes on the way.
      node = tree
      line.delete("\r\n").each_char do |c|
        node = (node[c] ||= {})
      end
      # Still at the root means the line had no usable characters.
      node[:end] = true unless node.equal?(tree)
    end
  end
end
|
52
|
+
|
53
|
+
# Returns the path of the Marshal dump written for the Chinese dictionary,
# located in config/filter_word under Rails.root. When +model+ is given the
# file name is prefixed with "<model>_".
def hash_path(model = nil)
  file_name = model ? "#{model}_harmonious.hash" : 'harmonious.hash'
  File.join(Rails.root, 'config', 'filter_word', file_name)
end
|
60
|
+
|
61
|
+
# Returns the path of the YAML file written for the English dictionary,
# located in config/filter_word under Rails.root. When +model+ is given the
# file name is prefixed with "<model>_".
def yaml_path(model = nil)
  file_name = model ? "#{model}_harmonious_english.yml" : 'harmonious_english.yml'
  File.join(Rails.root, 'config', 'filter_word', file_name)
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe FilterWord do

  describe 'local' do
    # FilterWord.harmonious_words: returns the flagged words found in a text.
    # The optional second argument is a model name.
    describe 'segment' do
      it 'should return harmonious word for sentence' do
        found = FilterWord.harmonious_words('戴秉国在中国')
        found.should == ['戴秉国']
      end

      it 'should return harmonious word for sentence under have model param' do
        found = FilterWord.harmonious_words('戴秉国在中国', 'user')
        found.should == ['戴秉国']
      end

      it 'should return english,url and chinese words' do
        text = '戴秉国 in china,watch cctv.com.let fuck it'
        found = FilterWord.harmonious_words(text, 'user')
        found.should == ['戴秉国','fuck']
      end

      it 'should return english words under have model param' do
        found = FilterWord.harmonious_words('Gruepin', 'user')
        found.should == ['Gruepin']
      end

    end

    # FilterWord.clean?: false when the text contains a flagged word.
    it 'should find harmonious chinese words' do
      verdict = FilterWord.clean?('李源潮在中国')
      verdict.should == false
    end

    it 'should find harmonious chinese words under have model param' do
      verdict = FilterWord.clean?('李源潮在中国', 'user')
      verdict.should == false
    end

    it 'should pass good words' do
      verdict = FilterWord.clean?('过去')
      verdict.should == true
    end

    it 'should pass good words under have model param' do
      verdict = FilterWord.clean?('过去', 'user')
      verdict.should == true
    end

    # FilterWord.clean: replaces each character of a flagged word with "*".
    it 'should clean sentence by replace harmonious words by *' do
      cleaned = FilterWord.clean('戴秉国在中国')
      cleaned.should == '***在中国'
    end

    it 'should clean sentence by replace harmonious words by * under model param' do
      cleaned = FilterWord.clean('戴秉国在中国', 'user')
      cleaned.should == '***在中国'
    end

    it 'should replace harmonious with *' do
      masked = FilterWord.clean_word_basic('大米')
      masked.should == '**'
    end
  end

end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
# Use an in-memory SQLite database for this spec.
ActiveRecord::Base.establish_connection(adapter: "sqlite3", database: ":memory:")

# Schema for the Post model used below: one string column and two text columns.
ActiveRecord::Schema.define(version: 1) do
  create_table(:posts) do |table|
    table.string :title
    table.text   :body
    table.text   :note
  end
end
|
13
|
+
|
14
|
+
# Model under test: :title and :body are passed to validate_harmonious_of,
# while :note is left out of the validation.
class Post < ActiveRecord::Base
  extend FilterWord::ModelAdditions

  validate_harmonious_of([:title, :body], model: :post)
end
|
18
|
+
|
19
|
+
describe FilterWord::ModelAdditions do
  # Every attribute holds the same flagged sentence. Post passes only :title
  # and :body to validate_harmonious_of.
  let(:post) { Post.create title:'戴秉国在中国',body:'戴秉国在中国',note:'戴秉国在中国' }

  describe 'use local' do
    it 'should have error on title' do
      post.errors[:title].should == ['不能含有敏感词']
    end

    it 'should have error on body' do
      post.errors[:body].should == ['不能含有敏感词']
    end

    it 'should allow empty input value' do
      # Assert on the record created here. The earlier version built a record
      # without a title but then inspected `post`, so the empty value was
      # never checked. The local is also no longer named `p`, which shadowed
      # Kernel#p.
      untitled = Post.create body:'戴秉国在中国',note:'戴秉国在中国'
      # NOTE(review): "no error for a missing title" follows this example's
      # name; confirm against FilterWord::ModelAdditions.
      untitled.errors[:title].should be_empty
      untitled.errors[:body].should == ['不能含有敏感词']
    end

    it 'should filter! any harmonious words' do
      FilterWord.clean(post.body).should == '***在中国'
    end
  end

end
|
42
|
+
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
|
3
|
+
require 'rails'
|
4
|
+
require 'active_model'
|
5
|
+
require 'active_record'
|
6
|
+
require 'filter_word'
|
7
|
+
|
8
|
+
# RSpec output settings for the whole suite: coloured output and the
# "documentation" formatter.
RSpec.configure do |config|
  # `color_enabled=` is the RSpec 2 name and is not available in RSpec 3,
  # where the setting is `color=`. The gemspec accepts any rspec version,
  # so use whichever setter this RSpec provides.
  if config.respond_to?(:color=)
    config.color = true
  else
    config.color_enabled = true
  end
  config.formatter = 'documentation'
end
|
12
|
+
|
13
|
+
# Defines Rails.root and Rails.env for the specs. `rails` is required earlier
# in this helper, so this presumably reopens the real Rails module and
# replaces those two methods.
module Rails
  class << self
    # Parent directory of the directory containing this file, expressed as
    # "<dir>/../".
    def root
      File.join(File.dirname(__FILE__), '../')
    end

    # The specs always run in the "test" environment.
    def env
      'test'
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: filter_word
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- xiaobo
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-09-08 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rake
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: "和谐检测用于检查输入是否包含中文或英文敏感词."
|
42
|
+
email:
|
43
|
+
- peterwillcn@gmail.com
|
44
|
+
executables:
|
45
|
+
- harmonious_rseg
|
46
|
+
extensions: []
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- ".gitignore"
|
50
|
+
- ".rspec"
|
51
|
+
- ".travis.yml"
|
52
|
+
- CHANGELOG
|
53
|
+
- MIT-LICENSE
|
54
|
+
- README.markdown
|
55
|
+
- Rakefile
|
56
|
+
- benchmark/benchmark.rb
|
57
|
+
- benchmark/text_test_100.txt
|
58
|
+
- benchmark/text_test_1000.txt
|
59
|
+
- benchmark/text_test_10000.txt
|
60
|
+
- bin/harmonious_rseg
|
61
|
+
- filter_word.gemspec
|
62
|
+
- lib/filter_word.rb
|
63
|
+
- lib/filter_word/app.rb
|
64
|
+
- lib/filter_word/engines/dict.rb
|
65
|
+
- lib/filter_word/engines/engine.rb
|
66
|
+
- lib/filter_word/engines/english.rb
|
67
|
+
- lib/filter_word/filters/conjunction.rb
|
68
|
+
- lib/filter_word/filters/fullwidth.rb
|
69
|
+
- lib/filter_word/filters/symbol.rb
|
70
|
+
- lib/filter_word/model_additions.rb
|
71
|
+
- lib/filter_word/railtie.rb
|
72
|
+
- lib/filter_word/rseg.rb
|
73
|
+
- lib/filter_word/version.rb
|
74
|
+
- lib/generators/filter_word/setup/setup_generator.rb
|
75
|
+
- lib/generators/filter_word/setup/templates/chinese_dictionary.txt
|
76
|
+
- lib/generators/filter_word/setup/templates/english_dictionary.txt
|
77
|
+
- lib/tasks/generate_dictionary.rake
|
78
|
+
- spec/filter_word_spec.rb
|
79
|
+
- spec/model_additions_spec.rb
|
80
|
+
- spec/spec_helper.rb
|
81
|
+
homepage: https://github.com/tian-xiaobo/filter_word
|
82
|
+
licenses: []
|
83
|
+
metadata: {}
|
84
|
+
post_install_message:
|
85
|
+
rdoc_options: []
|
86
|
+
require_paths:
|
87
|
+
- lib
|
88
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: '0'
|
93
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - ">="
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
requirements: []
|
99
|
+
rubyforge_project:
|
100
|
+
rubygems_version: 2.4.5.1
|
101
|
+
signing_key:
|
102
|
+
specification_version: 4
|
103
|
+
summary: filter any words that need to be harmonized
|
104
|
+
test_files:
|
105
|
+
- spec/filter_word_spec.rb
|
106
|
+
- spec/model_additions_spec.rb
|
107
|
+
- spec/spec_helper.rb
|