sensitive_words 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
+ 6-4tianwang
2
+ 89-64cdjp
3
+ ADMIN
4
+ Administrator
5
+ asshole
6
+ BLOWJOB
7
+ chinaliberal
8
+ chinamz
9
+ chinesenewsnet
10
+ Clockgemstone
11
+ creaders
12
+ Crestbone
13
+ dajiyuan
14
+ dfdz
15
+ DICK
16
+ falun
17
+ falundafa
18
+ Feelmistone
19
+ freechina
20
+ freenet
21
+ fuck
22
+ gcd
23
+ Gruepin
24
+ Guichuideng
25
+ HACKING
26
+ hongzhi
27
+ hrichina
28
+ HUANET
29
+ hypermart.net
30
+ incest
31
+ jiangdongriji
32
+ jiaochuang
33
+ jiaochun
34
+ KEFU
35
+ KISSMYASS
36
+ lihongzhi
37
+ minghui
38
+ minghuinews
39
+ nacb
40
+ Neckromancer
41
+ NMIS
42
+ PAPER64
43
+ penis
44
+ qiangjian
45
+ renminbao
46
+ renmingbao
47
+ SHIT
48
+ SUCKPENIS
49
+ taip
50
+ tibetalk
51
+ triangle
52
+ triangleboy
53
+ Tringel
54
+ UltraSurf
55
+ ustibet
56
+ voachinese
57
+ wangce
58
+ WEBZEN
59
+ wstaiji
60
+ xinsheng
61
+ YUMING
62
+ zangdu
63
+ ZHENGJIAN
64
+ ZHENGJIANWANG
65
+ ZHENSHANREN
66
+ zhuanfalun
@@ -0,0 +1,131 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Finds dictionary words ("sensitive words") inside a piece of text.
#
# Dictionaries are plain text files with one word per line. They are compiled
# into a character trie (nested Hashes keyed by single-character Strings); a
# node that completes a word carries the marker +:end => :id+.
#
# Matching is exact and case-sensitive. For every start position in the input
# the longest dictionary word beginning there is reported, so overlapping
# words (e.g. "ab" and "bc" in "abc") are both found.
class SensitiveWords

  # Trie shared by every scan; grown by SensitiveWords.load_dict.
  @@dict = {}

  class << self

    # Loads the dictionary file at +dict_path+ and merges it into the words
    # that are already loaded. May be called any number of times.
    #
    # Returns the merged trie. When the file does not exist, the error is
    # printed to stdout and nil is returned (best-effort loading).
    def load_dict(dict_path)
      # A shallow Hash#merge would replace the whole subtree of a shared first
      # character and silently drop previously loaded words, so merge
      # node by node instead.
      @@dict = merge_tries(@@dict, get_dict_file_hash(dict_path))
    rescue Errno::ENOENT => boom
      puts "#{boom.class} - #{boom.message}"
    end

    # Builds a trie from the file at +path+ (one word per line, blank lines
    # ignored) and returns it.
    #
    # Raises Errno::ENOENT when the file does not exist.
    def get_dict_file_hash(path)
      tree = {}
      # Block form guarantees the file is closed even if reading fails.
      File.open(path, 'r') do |file|
        file.each_line do |line|
          word = line.chomp
          next if word.empty?
          # Walk/create one node per character, then flag the last node.
          leaf = word.each_char.inject(tree) { |node, char| node[char] ||= {} }
          leaf[:end] = :id
        end
      end
      tree
    end

    # Returns the sensitive words found in +input+, in order of appearance.
    # When +max+ converts to a positive integer, at most +max+ words are
    # returned; otherwise all of them are.
    def sensitive_words(input, max=nil)
      ins = SensitiveWords.new(input)
      max = max.to_i
      if max > 0
        ins.sensitive_words(max)
      else
        ins.all_sensitive_words
      end
    end

    private

    # Recursively merges trie +extra+ into trie +base+ without mutating
    # either, keeping the branches of both.
    def merge_tries(base, extra)
      base.merge(extra) do |_key, old_value, new_value|
        if old_value.is_a?(Hash) && new_value.is_a?(Hash)
          merge_tries(old_value, new_value)
        else
          new_value
        end
      end
    end

  end

  # +input+ is the text to scan; nil is treated as empty text.
  def initialize(input)
    @input = input
    @words = []
  end

  # Returns at most +max+ sensitive words (an empty Array when +max+ is not
  # positive).
  def sensitive_words(max)
    limit = max.to_i
    @words = limit > 0 ? scan(limit) : []
  end

  # Returns every sensitive word found in the input.
  def all_sensitive_words
    @words = scan(nil)
  end

  private

  # Collects the longest dictionary word starting at each input position,
  # stopping early once +limit+ words were found (+limit+ nil = no limit).
  def scan(limit)
    chars = @input.to_s.chars
    found = []
    chars.each_index do |start|
      break if limit && found.size >= limit
      word = longest_word_at(chars, start)
      found << word if word
    end
    found
  end

  # Returns the longest dictionary word that begins at +chars[start]+, or nil
  # when no word begins there.
  def longest_word_at(chars, start)
    node = @@dict
    match_end = nil
    start.upto(chars.length - 1) do |pos|
      node = node[chars[pos]]
      break unless node
      # Remember every completed word on the way down so that a shorter word
      # is still reported when a longer candidate fails later on.
      match_end = pos if node[:end]
    end
    match_end && chars[start..match_end].join
  end

end
@@ -0,0 +1,21 @@
1
+ # coding: utf-8
2
# Prepend the relative 'lib' directory to the load path.
$LOAD_PATH.unshift('lib')

version = '0.0.1'

# Gem specification: package identity, author contact and the shipped files.
Gem::Specification.new('sensitive_words', version) do |gem|
  gem.authors     = %w[Jeffrey]
  gem.email       = %w[jeffrey6052@163.com]
  gem.description = "检索文章中的敏感词"
  gem.summary     = "-"
  gem.homepage    = "https://github.com/maymay25/sensitive_words"
  gem.license     = "MIT"

  # Explicit list of every file packaged into the gem.
  gem.files = %w[
    dictionary/dict1.txt
    dictionary/dict2.txt
    lib/sensitive_words.rb
    sensitive_words.gemspec
    test.rb
    README.md
  ]
end
data/test.rb ADDED
@@ -0,0 +1,17 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
require 'sensitive_words'

# Load the sensitive-word dictionaries first; load_dict may be called repeatedly.
%w[dict1 dict2].each do |dict_name|
  SensitiveWords.load_dict("#{__dir__}/dictionary/#{dict_name}.txt")
end

article = "习近平周永暴干康BLOWJOBjeffrey哈哈哈流氓政府"

# Find every sensitive word in the article.
# (The "# =>" notes are the original author's example output and depend on
# the contents of the loaded dictionaries.)
all_words = SensitiveWords.sensitive_words(article)
puts all_words.inspect # => ["习近平", "暴干", "BLOWJOB", "流氓政府"]

# Or ask for no more than a given number of sensitive words.
capped_words = SensitiveWords.sensitive_words(article, 2)
puts capped_words.inspect # => ["习近平", "暴干"]
metadata ADDED
@@ -0,0 +1,50 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sensitive_words
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jeffrey
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-10-11 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: "检索文章中的敏感词"
14
+ email:
15
+ - jeffrey6052@163.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - dictionary/dict1.txt
21
+ - dictionary/dict2.txt
22
+ - lib/sensitive_words.rb
23
+ - sensitive_words.gemspec
24
+ - test.rb
25
+ - README.md
26
+ homepage: https://github.com/maymay25/sensitive_words
27
+ licenses:
28
+ - MIT
29
+ metadata: {}
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubyforge_project:
46
+ rubygems_version: 2.0.14
47
+ signing_key:
48
+ specification_version: 4
49
+ summary: "-"
50
+ test_files: []