word_filter 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/word_filter/version.rb +1 -1
- data/lib/word_filter.rb +179 -0
- data/word_filter.gemspec +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 06e6a40537fb1cfad2ee83e8a598c8826bd40fda
|
4
|
+
data.tar.gz: 14efc8f39dc0a84df54e2195799ea65214134d32
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 921118f7c180ba70f95b4df31297b14351f1777a0dcfca5edaa4d2b7e77be84045880f0edc5b303c3db3059a32e07f098f6eba1d79ee607bbeb5a0fe7224ac62
|
7
|
+
data.tar.gz: 6d79bc70cb2d79ea499faa05533dfeccabbecafde561db16b4045085b0828d54b2ab902164a0bff8678bb519acc5e0b1921ea90b7d9dabea28fec60d4eb28df7
|
data/lib/word_filter/version.rb
CHANGED
data/lib/word_filter.rb
CHANGED
@@ -1,4 +1,183 @@
|
|
1
1
|
require "word_filter/version"
|
2
2
|
|
3
3
|
module WordFilter
  # Screens free-form text for contact details (e-mail addresses, URLs,
  # street addresses, phone numbers) and for words taken from a bad-word list.
  #
  # Typical use: build an instance, call #filterInit with the paths of a
  # dictionary file and a bad-word file, optionally raise #filterLevel, then
  # call #filterString for every text to check.
  #
  # Public method names keep their original camelCase spelling so existing
  # callers continue to work.
  class Filter
    # E-mail address. The dot before the top-level domain is a literal dot.
    EMAIL_REGEX = /[a-zA-Z0-9._%+-]+@[a-z0-9.-]+\.[a-zA-Z]{2,4}/

    # A single digit, or a number from zero to twenty spelled out in English.
    ALPHA_NUMERIC_DIGIT = /(zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|\d)/

    # The composite patterns below are regex literals on purpose: inside a
    # double-quoted Ruby string "\b" is a backspace character and "\s" a
    # space, which would silently change what the patterns match.
    digit = ALPHA_NUMERIC_DIGIT.source
    street_types = "avenue|ave|street|st|court|ct|circle|boulevard|blvd|lane|ln|trail|tr|loop|lp|route|rt|drive|dr|road|rd|terrace|tr|way|wy|highway|hiway|hw"

    # One or more digits / number words, optionally separated by whitespace.
    DIGITS_REGEX = /\b(\s*#{digit})+\b/

    # Digits, then one to five words, then a street-type word.
    STREET_NAME_REGEX = /\b(\s*#{digit})+\s([a-z\d]+\.?\s*){1,5}\b(#{street_types})\b/

    # Seven digits / number words, optionally separated by non-word characters.
    PHONE_NUMBER_REGEX = /((#{digit})\W*?){3}((#{digit})\W*?){4}\b/

    # http:// or https:// URL with optional port and path.
    URL_REGEX = /(?:http|https):\/\/[a-z0-9]+(?:[\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(?:(?::[0-9]{1,5})?\/[^\s]*)?/

    # Filter levels: which vowel-based disguises of a bad word are detected
    # in addition to the checks that always run.
    NONE = 0
    REPEATED_VOWELS = 1
    SWAPPABLE_VOWELS = 2
    SWAPPABLE_AND_REPEATED_VOWELS = 3
    SWAPPABLE_AND_REPEATED_VOWELS_INCLUDING_NONE = 4

    # One of the filter level constants above; defaults to NONE.
    attr_accessor :filterLevel

    def initialize
      @filterLevel = NONE
    end

    # Loads the word lists and prepares the bad-word regexes.
    # Must be called before #filterString.
    #
    # @param dictionaryFile [String] path of a file with whitespace-separated
    #   words that are always acceptable
    # @param badwordslist [String] path of a file with one bad word per line
    def filterInit(dictionaryFile, badwordslist)
      @goodWords = loadDictionary(dictionaryFile)
      # Hash index so that membership tests do not scan the whole dictionary.
      @goodWordIndex = @goodWords.each_with_object({}) { |word, index| index[word] = true }

      # The original Java class requires three more word lists; until they are
      # ported, fixed placeholder patterns are used for these two categories.
      @datingWordsRegex = /dating/
      @deviantWordsRegex = /deviant/
      @badWordsRegex = loadBadwords(badwordslist)

      # Derive the vowel-tolerant variants by rewriting every vowel in the
      # bad-word pattern.
      vowels = /([aeiou])/
      source = @badWordsRegex.source
      @vowelSwappedAndRepeatedRegex = Regexp.new(source.gsub(vowels, "[aeiou]+"))
      @vowelSwappedAndRepeatedRegexIncludingEmpty = Regexp.new(source.gsub(vowels, "[aeiou]*"))
      @vowelRepeatedRegex = Regexp.new(source.gsub(vowels, "\\1+"))
      @vowelSwappedRegex = Regexp.new(source.gsub(vowels, "[aeiou]"))
    end

    # Reads every whitespace-separated word of the file at +path+.
    #
    # @param path [String] dictionary file path
    # @return [Array<String>] the words in file order
    def loadDictionary(path)
      # File.read closes the file itself; split without arguments splits on
      # any run of whitespace, including line breaks.
      File.read(path).split
    end

    # Builds one alternation regex from the bad-word file (one word per line).
    #
    # Blank lines are skipped: an empty alternative would match every string.
    # Entries are inserted unescaped, so regex metacharacters in the list are
    # interpreted as regex syntax.
    #
    # @param path [String] bad-word file path
    # @return [Regexp] "(word1|word2|...)", or a never-matching regex when the
    #   file contains no words
    def loadBadwords(path)
      words = File.read(path).gsub("\r", "").split("\n").map(&:strip).reject(&:empty?)
      return /(?!)/ if words.empty?

      Regexp.new("(" + words.join("|") + ")")
    end

    # Classifies +input+.
    #
    # Return codes:
    #   0: string is safe to post
    #   1: string contains an email address
    #   2: string contains a URL
    #   3: string contains a street address
    #   4: string contains a phone number
    #   5: string contains a dating word
    #   6: string contains a deviant word
    #   7: string contains a bad word, matched directly
    #   9: string contains a bad word that was disguised (spacing, special
    #      characters, look-alike characters or, depending on #filterLevel,
    #      vowel tricks)
    #
    # Exceptions are not rescued here; they propagate to the caller.
    #
    # @param input [String] text to check
    # @return [Integer] one of the codes above
    def filterString(input)
      text = input.strip.downcase
      return 0 if text.empty?

      return 1 if EMAIL_REGEX =~ text
      return 2 if URL_REGEX =~ text
      return 3 if STREET_NAME_REGEX =~ text
      return 4 if PHONE_NUMBER_REGEX =~ text

      clean_version = stripGoodWords(normalize_punctuation(text))
      return 0 if clean_version.nil? || clean_version.empty?

      return 5 if @datingWordsRegex =~ clean_version
      return 6 if @deviantWordsRegex =~ clean_version
      return 7 if @badWordsRegex =~ clean_version

      disguised_bad_word?(clean_version) ? 9 : 0
    end

    # Removes every word of +input+ that is present in the dictionary.
    #
    # @param input [String] whitespace-separated words
    # @return [String] the remaining words joined by single spaces
    def stripGoodWords(input)
      input.split(" ").reject { |word| @goodWordIndex.key?(word) }.join(" ")
    end

    private

    # Collapses whitespace and blanks out punctuation, trailing exclamation
    # marks and the chat abbreviation "r u".
    def normalize_punctuation(text)
      text.gsub(/\s+/, " ")
          .gsub(/["',.;:?-]/, " ")
          .gsub(/!+\s/, " ")
          .gsub(/!+\z/, " ")
          .gsub(/\br\su/, " ")
    end

    # True when a bad word shows up after undoing common disguises.
    def disguised_bad_word?(clean_version)
      # Compress the string (drop whitespace), then check it again.
      return true if @badWordsRegex =~ clean_version.gsub(/[ \t\n\f\r]/, "")

      # Zap everything that is not a letter, then check it again.
      return true if @badWordsRegex =~ clean_version.gsub(/[^a-z]/, "")

      # Replace look-alike characters with letters. The vertical characters
      # (! 1 |) are ambiguous, so they are tried both as "i" and as "l".
      as_i = clean_version.tr("@683!1|0$+", "abbeiiiost")
      return true if @badWordsRegex =~ as_i

      as_l = clean_version.tr("@683!1|0$+", "abbelllost")
      return true if @badWordsRegex =~ as_l

      level_regex = vowel_regex_for_level
      return false unless level_regex

      !!(level_regex =~ as_i || level_regex =~ as_l)
    end

    # Vowel-tolerant regex selected by #filterLevel; nil for NONE or for an
    # unrecognised level.
    def vowel_regex_for_level
      case @filterLevel
      when REPEATED_VOWELS then @vowelRepeatedRegex
      when SWAPPABLE_VOWELS then @vowelSwappedRegex
      when SWAPPABLE_AND_REPEATED_VOWELS then @vowelSwappedAndRepeatedRegex
      when SWAPPABLE_AND_REPEATED_VOWELS_INCLUDING_NONE then @vowelSwappedAndRepeatedRegexIncludingEmpty
      end
    end
  end
end
|
data/word_filter.gemspec
CHANGED
@@ -6,7 +6,7 @@ require 'word_filter/version'
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = "word_filter"
|
8
8
|
spec.version = WordFilter::VERSION
|
9
|
-
spec.authors = ["Huascar
|
9
|
+
spec.authors = ["Huascar Ona"]
|
10
10
|
spec.email = ["huascarking@hotmail.com"]
|
11
11
|
spec.description = %q{A bad word filter for the input text.}
|
12
12
|
spec.summary = %q{A word filter gem}
|