jakal 0.1.92 → 0.1.93
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/jkl/text_client.rb +4 -4
- data/test/unit/text_cleaning_test.rb +10 -0
- metadata +2 -2
data/lib/jkl/text_client.rb
CHANGED
@@ -2,8 +2,8 @@ module Jkl
|
|
2
2
|
module Text
|
3
3
|
class << self
|
4
4
|
|
5
|
-
def sanitize(text)
|
6
|
-
remove_short_lines(strip_all_tags(remove_script_tags(text)))
|
5
|
+
def sanitize(text, number = 5)
|
6
|
+
remove_short_lines(strip_all_tags(remove_script_tags(text)), number)
|
7
7
|
end
|
8
8
|
alias :clean :sanitize
|
9
9
|
|
@@ -24,12 +24,12 @@ module Jkl
|
|
24
24
|
text.gsub(/((<[\s\/]*script\b[^>]*>)([^>]*)(<\/script>))/i, "")
|
25
25
|
end
|
26
26
|
|
27
|
-
def remove_short_lines(text)
|
27
|
+
def remove_short_lines(text, number = 5)
|
28
28
|
text = text.gsub(/\s\s/, "\n")
|
29
29
|
str = ""
|
30
30
|
# remove short lines - usually just navigation
|
31
31
|
text.split("\n").each do |l|
|
32
|
-
str << l unless l.count(" ") < 5
|
32
|
+
str << l unless l.count(" ") < number
|
33
33
|
end
|
34
34
|
str
|
35
35
|
end
|
@@ -16,6 +16,16 @@ HTML
|
|
16
16
|
assert result == "the cat sat on the mat"
|
17
17
|
end
|
18
18
|
|
19
|
+
should "Remove shorter lines" do
|
20
|
+
input = <<-HTML
|
21
|
+
the cat sat on the mat
|
22
|
+
the cat sat on the slightly fluffy, yet worn and homely mat
|
23
|
+
a short line
|
24
|
+
HTML
|
25
|
+
result = Jkl::Text::remove_short_lines(input, 8)
|
26
|
+
assert result == "the cat sat on the slightly fluffy, yet worn and homely mat"
|
27
|
+
end
|
28
|
+
|
19
29
|
should "Remove script tags" do
|
20
30
|
input = <<-HTML
|
21
31
|
the cat sat on the mat
|