filter_word 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,66 @@
1
+ 6-4tianwang
2
+ 89-64cdjp
3
+ ADMIN
4
+ Administrator
5
+ asshole
6
+ BLOWJOB
7
+ chinaliberal
8
+ chinamz
9
+ chinesenewsnet
10
+ Clockgemstone
11
+ creaders
12
+ Crestbone
13
+ dajiyuan
14
+ dfdz
15
+ DICK
16
+ falun
17
+ falundafa
18
+ Feelmistone
19
+ freechina
20
+ freenet
21
+ fuck
22
+ gcd
23
+ Gruepin
24
+ Guichuideng
25
+ HACKING
26
+ hongzhi
27
+ hrichina
28
+ HUANET
29
+ hypermart.net
30
+ incest
31
+ jiangdongriji
32
+ jiaochuang
33
+ jiaochun
34
+ KEFU
35
+ KISSMYASS
36
+ lihongzhi
37
+ minghui
38
+ minghuinews
39
+ nacb
40
+ Neckromancer
41
+ NMIS
42
+ PAPER64
43
+ penis
44
+ qiangjian
45
+ renminbao
46
+ renmingbao
47
+ SHIT
48
+ SUCKPENIS
49
+ taip
50
+ tibetalk
51
+ triangle
52
+ triangleboy
53
+ Tringel
54
+ UltraSurf
55
+ ustibet
56
+ voachinese
57
+ wangce
58
+ WEBZEN
59
+ wstaiji
60
+ xinsheng
61
+ YUMING
62
+ zangdu
63
+ ZHENGJIAN
64
+ ZHENGJIANWANG
65
+ ZHENSHANREN
66
+ zhuanfalun
@@ -0,0 +1,67 @@
1
+ require 'yaml'
2
+
3
namespace :filter_word do
  desc "generate harmonious dictionary for use"
  task :generate => :environment do
    puts "Processing chinese words..."

    # An optional model name (ENV['model']) selects "<model>_"-prefixed
    # dictionary files; without it the shared dictionaries are used.
    model = ENV['model']
    prefix = model.nil? ? '' : "#{model}_"
    chinese_dictionary_path = File.join(Rails.root, 'config','filter_word', "#{prefix}chinese_dictionary.txt")
    english_dictionary_path = File.join(Rails.root, 'config','filter_word', "#{prefix}english_dictionary.txt")

    # Chinese words are compiled into a nested per-character hash and
    # stored with Marshal.
    tree = {}
    process(chinese_dictionary_path, tree)
    File.open(hash_path(model), "wb") { |out| Marshal.dump(tree, out) }
    puts 'chinese_dictionary hash Done'

    # English words stay a flat list and are stored as YAML.
    puts 'Processing english words...'
    english_dictionary_list = []
    process_english_words(english_dictionary_path, english_dictionary_list)
    File.open(yaml_path(model), "wb") { |out| YAML.dump(english_dictionary_list, out) }
    puts 'english_dictionary yaml Done'
  end
end
27
+
28
# Reads the word list at +path+ (one word per line) and appends every word
# to +list+.
#
# Line terminators ("\n", "\r\n" or "\r") are stripped with String#chomp.
# The previous implementation used gsub!, which returns nil when nothing was
# replaced, so a final line without a trailing newline was stored as nil.
# Blank lines are skipped so the list never contains an empty word.
#
# @param path [String] path of the dictionary file
# @param list [Array<String>] collection that receives the words (mutated)
# @return [Array<String>] the same +list+
def process_english_words(path, list)
  File.foreach(path) do |line|
    word = line.chomp
    list << word unless word.empty?
  end
  list
end
33
+
34
# Reads the word list at +path+ (one word per line) and merges every word
# into +tree+, a nested Hash keyed by single characters. The node reached
# after a word's last character is marked with :end => true.
#
# "\r" and "\n" characters are dropped from each line. Lines that contain
# nothing else are skipped; previously such a line left no current node and
# the :end assignment raised NoMethodError on nil.
#
# @param path [String] path of the dictionary file
# @param tree [Hash] root node, mutated in place
# @return [Hash] the same +tree+
def process(path, tree)
  File.foreach(path) do |line|
    word = line.delete("\r\n")
    next if word.empty?

    # Walk (and create on demand) one nested hash per character, starting
    # at the root; the block value is the child node for the next step.
    leaf = word.each_char.reduce(tree) { |node, char| node[char] ||= {} }
    leaf[:end] = true
  end
  tree
end
52
+
53
# Location of the marshalled Chinese dictionary hash under
# config/filter_word. A truthy +model+ prefixes the file name with
# "<model>_".
def hash_path(model = nil)
  file_name = model ? "#{model}_harmonious.hash" : 'harmonious.hash'
  File.join(Rails.root, 'config','filter_word', file_name)
end
60
+
61
# Location of the English dictionary YAML file under config/filter_word.
# A truthy +model+ prefixes the file name with "<model>_".
def yaml_path(model = nil)
  file_name = model ? "#{model}_harmonious_english.yml" : 'harmonious_english.yml'
  File.join(Rails.root, 'config','filter_word', file_name)
end
@@ -0,0 +1,56 @@
1
+ # encoding: utf-8
2
+
3
+ require 'spec_helper'
4
+
5
# Specs for the FilterWord entry points, each exercised with and without
# the optional model argument ('user').
describe FilterWord do

  describe 'local' do
    describe 'segment' do
      it 'should return harmonious word for sentence' do
        found = FilterWord.harmonious_words('戴秉国在中国')
        found.should == ['戴秉国']
      end

      it 'should return harmonious word for sentence under have model param' do
        found = FilterWord.harmonious_words('戴秉国在中国', 'user')
        found.should == ['戴秉国']
      end

      it 'should return english,url and chinese words' do
        found = FilterWord.harmonious_words('戴秉国 in china,watch cctv.com.let fuck it','user')
        found.should == ['戴秉国','fuck']
      end

      it 'should return english words under have model param' do
        found = FilterWord.harmonious_words('Gruepin','user')
        found.should == ['Gruepin']
      end
    end

    # clean? is expected to be false when the text contains a listed word.
    it 'should find harmonious chinese words' do
      verdict = FilterWord.clean?('李源潮在中国')
      verdict.should == false
    end

    it 'should find harmonious chinese words under have model param' do
      verdict = FilterWord.clean?('李源潮在中国', 'user')
      verdict.should == false
    end

    it 'should pass good words' do
      verdict = FilterWord.clean?('过去')
      verdict.should == true
    end

    it 'should pass good words under have model param' do
      verdict = FilterWord.clean?('过去', 'user')
      verdict.should == true
    end

    # clean is expected to mask each character of a listed word with '*'.
    it 'should clean sentence by replace harmonious words by *' do
      masked = FilterWord.clean('戴秉国在中国')
      masked.should == '***在中国'
    end

    it 'should clean sentence by replace harmonious words by * under model param' do
      masked = FilterWord.clean('戴秉国在中国', 'user')
      masked.should == '***在中国'
    end

    it 'should replace harmonious with *' do
      masked = FilterWord.clean_word_basic('大米')
      masked.should == '**'
    end
  end

end
@@ -0,0 +1,42 @@
1
+ # encoding: utf-8
2
+ require 'spec_helper'
3
+
4
# Run the model specs against an in-memory SQLite database.
ActiveRecord::Base.establish_connection(adapter: "sqlite3", database: ":memory:")

# Minimal schema: a posts table with title, body and note columns.
ActiveRecord::Schema.define(version: 1) do
  create_table(:posts) do |table|
    table.string :title
    table.text :body
    table.text :note
  end
end
13
+
14
# Model used by the specs: title and body are passed to
# validate_harmonious_of, note is not.
class Post < ActiveRecord::Base
  extend FilterWord::ModelAdditions

  validate_harmonious_of([:title, :body], :model => :post)
end
18
+
19
# Specs for the validate_harmonious_of model macro, using the Post model.
describe FilterWord::ModelAdditions do
  # A post whose title, body and note all carry the same sentence; the
  # examples expect the validated fields (title, body) to be rejected.
  let(:post) { Post.create title:'戴秉国在中国',body:'戴秉国在中国',note:'戴秉国在中国' }

  describe 'use local' do
    it 'should validate for harmonious' do
      post.errors[:title].should == ['不能含有敏感词']
    end

    # The description used to say "title" although the example checks :body.
    it 'should have error on body' do
      post.errors[:body].should == ['不能含有敏感词']
    end

    it 'should allow empty input value' do
      # Assert on the record created here (title left empty). The example
      # previously built this record and then inspected the unrelated `post`.
      record = Post.create body:'戴秉国在中国',note:'戴秉国在中国'
      record.errors[:body].should == ['不能含有敏感词']
      # NOTE(review): the example name suggests record.errors[:title] should
      # be empty as well -- confirm against the validator before asserting.
    end

    it 'should filter! any harmonious words' do
      FilterWord.clean(post.body).should == '***在中国'
    end
  end

end
42
+
@@ -0,0 +1,21 @@
1
+ require 'rubygems'
2
+
3
+ require 'rails'
4
+ require 'active_model'
5
+ require 'active_record'
6
+ require 'filter_word'
7
+
8
RSpec.configure do |config|
  # `color_enabled=` is the RSpec 2 setter; RSpec 3 only provides `color=`.
  # The gem's rspec dependency is unversioned, so support whichever the
  # installed RSpec offers.
  if config.respond_to?(:color=)
    config.color = true
  else
    config.color_enabled = true
  end
  config.formatter = 'documentation'
end
12
+
13
# Spec-only overrides for the two Rails module methods defined here.
module Rails
  class << self
    # Root used by the specs: the parent of the directory holding this file.
    def root
      File.join(File.dirname(__FILE__), '../')
    end

    # The environment name is fixed to 'test'.
    def env
      'test'
    end
  end
end
metadata ADDED
@@ -0,0 +1,107 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: filter_word
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - xiaobo
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-09-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: "和谐检测用于检查输入是否包含中文或英文敏感词."
42
+ email:
43
+ - peterwillcn@gmail.com
44
+ executables:
45
+ - harmonious_rseg
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - ".gitignore"
50
+ - ".rspec"
51
+ - ".travis.yml"
52
+ - CHANGELOG
53
+ - MIT-LICENSE
54
+ - README.markdown
55
+ - Rakefile
56
+ - benchmark/benchmark.rb
57
+ - benchmark/text_test_100.txt
58
+ - benchmark/text_test_1000.txt
59
+ - benchmark/text_test_10000.txt
60
+ - bin/harmonious_rseg
61
+ - filter_word.gemspec
62
+ - lib/filter_word.rb
63
+ - lib/filter_word/app.rb
64
+ - lib/filter_word/engines/dict.rb
65
+ - lib/filter_word/engines/engine.rb
66
+ - lib/filter_word/engines/english.rb
67
+ - lib/filter_word/filters/conjunction.rb
68
+ - lib/filter_word/filters/fullwidth.rb
69
+ - lib/filter_word/filters/symbol.rb
70
+ - lib/filter_word/model_additions.rb
71
+ - lib/filter_word/railtie.rb
72
+ - lib/filter_word/rseg.rb
73
+ - lib/filter_word/version.rb
74
+ - lib/generators/filter_word/setup/setup_generator.rb
75
+ - lib/generators/filter_word/setup/templates/chinese_dictionary.txt
76
+ - lib/generators/filter_word/setup/templates/english_dictionary.txt
77
+ - lib/tasks/generate_dictionary.rake
78
+ - spec/filter_word_spec.rb
79
+ - spec/model_additions_spec.rb
80
+ - spec/spec_helper.rb
81
+ homepage: https://github.com/tian-xiaobo/filter_word
82
+ licenses: []
83
+ metadata: {}
84
+ post_install_message:
85
+ rdoc_options: []
86
+ require_paths:
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ required_rubygems_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ requirements: []
99
+ rubyforge_project:
100
+ rubygems_version: 2.4.5.1
101
+ signing_key:
102
+ specification_version: 4
103
+ summary: filter any words that need to be harmonized
104
+ test_files:
105
+ - spec/filter_word_spec.rb
106
+ - spec/model_additions_spec.rb
107
+ - spec/spec_helper.rb