sensitive_words 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +53 -0
- data/dictionary/dict1.txt +1511 -0
- data/dictionary/dict2.txt +66 -0
- data/lib/sensitive_words.rb +131 -0
- data/sensitive_words.gemspec +21 -0
- data/test.rb +17 -0
- metadata +50 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
6-4tianwang
|
|
2
|
+
89-64cdjp
|
|
3
|
+
ADMIN
|
|
4
|
+
Administrator
|
|
5
|
+
asshole
|
|
6
|
+
BLOWJOB
|
|
7
|
+
chinaliberal
|
|
8
|
+
chinamz
|
|
9
|
+
chinesenewsnet
|
|
10
|
+
Clockgemstone
|
|
11
|
+
creaders
|
|
12
|
+
Crestbone
|
|
13
|
+
dajiyuan
|
|
14
|
+
dfdz
|
|
15
|
+
DICK
|
|
16
|
+
falun
|
|
17
|
+
falundafa
|
|
18
|
+
Feelmistone
|
|
19
|
+
freechina
|
|
20
|
+
freenet
|
|
21
|
+
fuck
|
|
22
|
+
gcd
|
|
23
|
+
Gruepin
|
|
24
|
+
Guichuideng
|
|
25
|
+
HACKING
|
|
26
|
+
hongzhi
|
|
27
|
+
hrichina
|
|
28
|
+
HUANET
|
|
29
|
+
hypermart.net
|
|
30
|
+
incest
|
|
31
|
+
jiangdongriji
|
|
32
|
+
jiaochuang
|
|
33
|
+
jiaochun
|
|
34
|
+
KEFU
|
|
35
|
+
KISSMYASS
|
|
36
|
+
lihongzhi
|
|
37
|
+
minghui
|
|
38
|
+
minghuinews
|
|
39
|
+
nacb
|
|
40
|
+
Neckromancer
|
|
41
|
+
NMIS
|
|
42
|
+
PAPER64
|
|
43
|
+
penis
|
|
44
|
+
qiangjian
|
|
45
|
+
renminbao
|
|
46
|
+
renmingbao
|
|
47
|
+
SHIT
|
|
48
|
+
SUCKPENIS
|
|
49
|
+
taip
|
|
50
|
+
tibetalk
|
|
51
|
+
triangle
|
|
52
|
+
triangleboy
|
|
53
|
+
Tringel
|
|
54
|
+
UltraSurf
|
|
55
|
+
ustibet
|
|
56
|
+
voachinese
|
|
57
|
+
wangce
|
|
58
|
+
WEBZEN
|
|
59
|
+
wstaiji
|
|
60
|
+
xinsheng
|
|
61
|
+
YUMING
|
|
62
|
+
zangdu
|
|
63
|
+
ZHENGJIAN
|
|
64
|
+
ZHENGJIANWANG
|
|
65
|
+
ZHENSHANREN
|
|
66
|
+
zhuanfalun
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
# Finds dictionary words ("sensitive words") inside a piece of text.
#
# Dictionaries are plain-text files with one word per line. Every loaded
# dictionary is folded into a single character trie shared by all scans:
# each key is one character, and a node that completes a word carries the
# marker entry +:end => :id+.
class SensitiveWords

  # Shared trie built from every dictionary loaded so far.
  @@dict = {}

  class << self

    # Loads the dictionary at +dict_path+ and adds its words to the shared trie.
    # May be called several times; words from earlier dictionaries are kept.
    #
    # A missing file is reported on standard output instead of raising.
    #
    # @param dict_path [String] path of a one-word-per-line dictionary file
    # @return [Hash, nil] the updated trie, or nil when the file does not exist
    def load_dict(dict_path)
      # A plain Hash#merge is shallow: it would replace the whole subtree of a
      # first character and drop every previously loaded word that starts with
      # it. The tries must be merged node by node.
      @@dict = deep_merge(@@dict, get_dict_file_hash(dict_path))
    rescue Errno::ENOENT => boom
      puts "#{boom.class} - #{boom.message}"
    end

    # Reads a dictionary file and returns its words as a character trie.
    # Empty lines are skipped.
    #
    # @param path [String] path of the dictionary file
    # @return [Hash] nested hashes keyed by character; word ends hold :end => :id
    # @raise [Errno::ENOENT] when +path+ does not exist
    def get_dict_file_hash(path)
      tree = {}
      # File.foreach closes the file itself, even if the block raises.
      File.foreach(path) do |line|
        word = line.chomp
        next if word.empty?

        leaf = word.each_char.inject(tree) { |node, c| node[c] ||= {} }
        leaf[:end] = :id
      end
      tree
    end

    # Convenience entry point: scans +input+ with the loaded dictionaries.
    #
    # @param input [String] text to scan
    # @param max [Integer, nil] upper bound on the number of words returned;
    #   nil or a non-positive value means "no limit"
    # @return [Array<String>] matched words in order of appearance
    def sensitive_words(input,max=nil)
      ins = SensitiveWords.new(input)
      max = max.to_i
      if max > 0
        ins.sensitive_words(max)
      else
        ins.all_sensitive_words
      end
    end

    private

    # Recursively merges two tries without modifying either of them.
    def deep_merge(base, extra)
      base.merge(extra) do |_key, old, new|
        old.is_a?(Hash) && new.is_a?(Hash) ? deep_merge(old, new) : new
      end
    end

  end

  # @param input [String] text that will be scanned
  def initialize(input)
    @input = input
    @words = []
  end

  # Returns at most +max+ sensitive words found in the input.
  #
  # @param max [Integer] upper bound on the number of words returned
  # @return [Array<String>] matched words in order of appearance
  def sensitive_words(max)
    @words = scan(max)
  end

  # Returns every sensitive word found in the input.
  #
  # @return [Array<String>] matched words in order of appearance
  def all_sensitive_words
    @words = scan
  end

  private

  # Tries every character position of the input as the start of a word and
  # collects the longest dictionary word beginning there. Matches may overlap,
  # because the scan always resumes at the next character.
  def scan(limit = nil)
    chars = @input.to_s.chars
    found = []
    chars.each_index do |start|
      break if limit && found.size >= limit

      stop = longest_match_end(chars, start)
      found << chars[start...stop].join if stop
    end
    found
  end

  # Walks the trie from chars[start] and returns the exclusive end index of the
  # longest dictionary word starting there, or nil when there is none.
  # Remembering the last word end seen keeps a shorter word from being lost when
  # the walk continues into a longer entry that finally fails to match.
  def longest_match_end(chars, start)
    node = @@dict
    best = nil
    pos = start
    while pos < chars.size && (node = node[chars[pos]])
      pos += 1
      best = pos if node[:end]
    end
    best
  end

end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# coding: utf-8
|
|
2
|
+
# Prepend the relative 'lib' directory to the load path.
$LOAD_PATH.unshift('lib')

version = '0.0.1'

# Gem specification for sensitive_words: identity, author details and the
# explicit list of files shipped in the package.
Gem::Specification.new do |spec|
  spec.name        = 'sensitive_words'
  spec.version     = version
  spec.authors     = ["Jeffrey"]
  spec.email       = ["jeffrey6052@163.com"]
  spec.description = "检索文章中的敏感词"
  spec.summary     = "-"
  spec.homepage    = "https://github.com/maymay25/sensitive_words"
  spec.license     = "MIT"

  # Packaged files: both dictionaries, the library, this spec, the sample
  # script and the README.
  spec.files = %w[
    dictionary/dict1.txt
    dictionary/dict2.txt
    lib/sensitive_words.rb
    sensitive_words.gemspec
    test.rb
    README.md
  ]
end
|
data/test.rb
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
require 'sensitive_words'

# Load the sensitive-word dictionaries first; load_dict can be called
# more than once.
%w[dict1.txt dict2.txt].each do |dict_name|
  SensitiveWords.load_dict("#{__dir__}/dictionary/#{dict_name}")
end

article = "习近平周永暴干康BLOWJOBjeffrey哈哈哈流氓政府"

# Find every sensitive word in the article.
words = SensitiveWords.sensitive_words(article)
p words # => ["习近平", "暴干", "BLOWJOB", "流氓政府"]

# Or ask only for a capped number of sensitive words.
words = SensitiveWords.sensitive_words(article, 2)
p words # => ["习近平", "暴干"]
|
metadata
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: sensitive_words
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Jeffrey
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2014-10-11 00:00:00.000000000 Z
|
|
12
|
+
dependencies: []
|
|
13
|
+
description: "检索文章中的敏感词"
|
|
14
|
+
email:
|
|
15
|
+
- jeffrey6052@163.com
|
|
16
|
+
executables: []
|
|
17
|
+
extensions: []
|
|
18
|
+
extra_rdoc_files: []
|
|
19
|
+
files:
|
|
20
|
+
- dictionary/dict1.txt
|
|
21
|
+
- dictionary/dict2.txt
|
|
22
|
+
- lib/sensitive_words.rb
|
|
23
|
+
- sensitive_words.gemspec
|
|
24
|
+
- test.rb
|
|
25
|
+
- README.md
|
|
26
|
+
homepage: https://github.com/maymay25/sensitive_words
|
|
27
|
+
licenses:
|
|
28
|
+
- MIT
|
|
29
|
+
metadata: {}
|
|
30
|
+
post_install_message:
|
|
31
|
+
rdoc_options: []
|
|
32
|
+
require_paths:
|
|
33
|
+
- lib
|
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
35
|
+
requirements:
|
|
36
|
+
- - ">="
|
|
37
|
+
- !ruby/object:Gem::Version
|
|
38
|
+
version: '0'
|
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
40
|
+
requirements:
|
|
41
|
+
- - ">="
|
|
42
|
+
- !ruby/object:Gem::Version
|
|
43
|
+
version: '0'
|
|
44
|
+
requirements: []
|
|
45
|
+
rubyforge_project:
|
|
46
|
+
rubygems_version: 2.0.14
|
|
47
|
+
signing_key:
|
|
48
|
+
specification_version: 4
|
|
49
|
+
summary: "-"
|
|
50
|
+
test_files: []
|