sensitive_words 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +53 -0
- data/dictionary/dict1.txt +1511 -0
- data/dictionary/dict2.txt +66 -0
- data/lib/sensitive_words.rb +131 -0
- data/sensitive_words.gemspec +21 -0
- data/test.rb +17 -0
- metadata +50 -0
@@ -0,0 +1,66 @@
|
|
1
|
+
6-4tianwang
|
2
|
+
89-64cdjp
|
3
|
+
ADMIN
|
4
|
+
Administrator
|
5
|
+
asshole
|
6
|
+
BLOWJOB
|
7
|
+
chinaliberal
|
8
|
+
chinamz
|
9
|
+
chinesenewsnet
|
10
|
+
Clockgemstone
|
11
|
+
creaders
|
12
|
+
Crestbone
|
13
|
+
dajiyuan
|
14
|
+
dfdz
|
15
|
+
DICK
|
16
|
+
falun
|
17
|
+
falundafa
|
18
|
+
Feelmistone
|
19
|
+
freechina
|
20
|
+
freenet
|
21
|
+
fuck
|
22
|
+
gcd
|
23
|
+
Gruepin
|
24
|
+
Guichuideng
|
25
|
+
HACKING
|
26
|
+
hongzhi
|
27
|
+
hrichina
|
28
|
+
HUANET
|
29
|
+
hypermart.net
|
30
|
+
incest
|
31
|
+
jiangdongriji
|
32
|
+
jiaochuang
|
33
|
+
jiaochun
|
34
|
+
KEFU
|
35
|
+
KISSMYASS
|
36
|
+
lihongzhi
|
37
|
+
minghui
|
38
|
+
minghuinews
|
39
|
+
nacb
|
40
|
+
Neckromancer
|
41
|
+
NMIS
|
42
|
+
PAPER64
|
43
|
+
penis
|
44
|
+
qiangjian
|
45
|
+
renminbao
|
46
|
+
renmingbao
|
47
|
+
SHIT
|
48
|
+
SUCKPENIS
|
49
|
+
taip
|
50
|
+
tibetalk
|
51
|
+
triangle
|
52
|
+
triangleboy
|
53
|
+
Tringel
|
54
|
+
UltraSurf
|
55
|
+
ustibet
|
56
|
+
voachinese
|
57
|
+
wangce
|
58
|
+
WEBZEN
|
59
|
+
wstaiji
|
60
|
+
xinsheng
|
61
|
+
YUMING
|
62
|
+
zangdu
|
63
|
+
ZHENGJIAN
|
64
|
+
ZHENGJIANWANG
|
65
|
+
ZHENSHANREN
|
66
|
+
zhuanfalun
|
@@ -0,0 +1,131 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
# Finds dictionary-listed "sensitive" words inside a piece of text.
#
# Dictionaries are plain-text files holding one word per line. Their words are
# stored in a character trie shared by the whole class, so dictionaries must
# be loaded (see SensitiveWords.load_dict) before any text is scanned.
class SensitiveWords
  # Shared trie: every String key is one character mapping to a child node
  # (another Hash); a node that carries the :end key terminates a word.
  @@dict = {}

  class << self
    # Reads the word list at +dict_path+ and adds its words to the shared
    # dictionary. May be called several times; words loaded earlier are kept
    # even when a later file contains words starting with the same characters.
    #
    # Returns the merged trie. When the file does not exist the error is
    # printed to stdout, the dictionary is left untouched and nil is returned.
    def load_dict(dict_path)
      @@dict = merge_tries(@@dict, get_dict_file_hash(dict_path))
    rescue Errno::ENOENT => boom
      puts "#{boom.class} - #{boom.message}"
    end

    # Builds a trie (nested Hash) from the file at +path+, one word per line.
    # Line endings are removed and empty lines are skipped.
    #
    # Raises Errno::ENOENT when the file does not exist.
    def get_dict_file_hash(path)
      tree = {}
      # Block form closes the handle even if reading raises.
      File.open(path, 'r') do |file|
        file.each_line do |line|
          word = line.chomp
          next if word.empty?

          # Walk/create one node per character, then mark the word end.
          leaf = word.each_char.inject(tree) { |node, char| node[char] ||= {} }
          leaf[:end] = :id
        end
      end
      tree
    end

    # Scans +input+ for sensitive words.
    #
    # When +max+ converts (via to_i) to a positive number at most that many
    # words are returned; otherwise every match is returned.
    def sensitive_words(input, max = nil)
      scanner = SensitiveWords.new(input)
      limit = max.to_i
      limit > 0 ? scanner.sensitive_words(limit) : scanner.all_sensitive_words
    end

    private

    # Recursively merges two tries so that sibling branches from both sides
    # survive; a plain Hash#merge would replace whole subtrees.
    def merge_tries(base, extra)
      base.merge(extra) do |_key, old_node, new_node|
        if old_node.is_a?(Hash) && new_node.is_a?(Hash)
          merge_tries(old_node, new_node)
        else
          new_node
        end
      end
    end
  end

  # +input+ is the text to scan; it is converted with to_s when scanned.
  def initialize(input)
    @input = input
    @words = []
  end

  # Returns at most +max+ sensitive words, in order of their start position.
  # A +max+ that is not positive yields an empty list.
  def sensitive_words(max)
    limit = max.to_i
    @words = limit > 0 ? scan(limit) : []
  end

  # Returns every sensitive word found, in order of start position.
  def all_sensitive_words
    @words = scan
  end

  private

  # Tries every character position as a possible word start and collects the
  # longest dictionary word beginning there. Stops early once +limit+ words
  # have been collected (no limit when nil).
  def scan(limit = nil)
    chars = @input.to_s.each_char.to_a
    found = []
    chars.each_index do |start|
      break if limit && found.size >= limit

      word = longest_word_at(chars, start)
      found << word if word
    end
    found
  end

  # Follows the trie from chars[start] as far as the text allows and returns
  # the longest complete word seen on the way, or nil when there is none.
  def longest_word_at(chars, start)
    node = @@dict
    stop = nil
    pos = start
    while pos < chars.length && (node = node[chars[pos]])
      pos += 1
      # Remember the last word boundary so a shorter word is still reported
      # when a longer branch turns out to be only partially matched.
      stop = pos if node[:end]
    end
    stop && chars[start...stop].join
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for the sensitive_words gem.

# Put the (relative) lib directory at the front of the load path.
$LOAD_PATH.unshift('lib')

gem_version = '0.0.1'

Gem::Specification.new do |s|
  s.name        = 'sensitive_words'
  s.version     = gem_version
  s.authors     = ["Jeffrey"]
  s.email       = ["jeffrey6052@163.com"]
  s.description = "检索文章中的敏感词"
  s.summary     = "-"
  s.homepage    = "https://github.com/maymay25/sensitive_words"
  s.license     = "MIT"

  # Files shipped inside the packaged gem.
  s.files = %w[
    dictionary/dict1.txt
    dictionary/dict2.txt
    lib/sensitive_words.rb
    sensitive_words.gemspec
    test.rb
    README.md
  ]
end
|
data/test.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# -*- coding: utf-8 -*-

require 'sensitive_words'

# Load the sensitive-word dictionaries first; load_dict may be called more
# than once and each call adds to the words already loaded.
SensitiveWords.load_dict("#{__dir__}/dictionary/dict1.txt")
SensitiveWords.load_dict("#{__dir__}/dictionary/dict2.txt")

text = "习近平周永暴干康BLOWJOBjeffrey哈哈哈流氓政府"

# Find every sensitive word in the article.
all_hits = SensitiveWords.sensitive_words(text)
p all_hits # => ["习近平", "暴干", "BLOWJOB", "流氓政府"]

# Or ask for no more than a given number of sensitive words.
capped_hits = SensitiveWords.sensitive_words(text, 2)
p capped_hits # => ["习近平", "暴干"]
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sensitive_words
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jeffrey
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-10-11 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: "检索文章中的敏感词"
|
14
|
+
email:
|
15
|
+
- jeffrey6052@163.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- dictionary/dict1.txt
|
21
|
+
- dictionary/dict2.txt
|
22
|
+
- lib/sensitive_words.rb
|
23
|
+
- sensitive_words.gemspec
|
24
|
+
- test.rb
|
25
|
+
- README.md
|
26
|
+
homepage: https://github.com/maymay25/sensitive_words
|
27
|
+
licenses:
|
28
|
+
- MIT
|
29
|
+
metadata: {}
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
require_paths:
|
33
|
+
- lib
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 2.0.14
|
47
|
+
signing_key:
|
48
|
+
specification_version: 4
|
49
|
+
summary: "-"
|
50
|
+
test_files: []
|