jakal 0.1.92 → 0.1.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/jkl/text_client.rb +4 -4
- data/test/unit/text_cleaning_test.rb +10 -0
- metadata +2 -2
data/lib/jkl/text_client.rb
CHANGED
@@ -2,8 +2,8 @@ module Jkl
|
|
2
2
|
module Text
|
3
3
|
class << self
|
4
4
|
|
5
|
-
def sanitize(text)
|
6
|
-
remove_short_lines(strip_all_tags(remove_script_tags(text)))
|
5
|
+
def sanitize(text, number = 5)
|
6
|
+
remove_short_lines(strip_all_tags(remove_script_tags(text)), number)
|
7
7
|
end
|
8
8
|
alias :clean :sanitize
|
9
9
|
|
@@ -24,12 +24,12 @@ module Jkl
|
|
24
24
|
text.gsub(/((<[\s\/]*script\b[^>]*>)([^>]*)(<\/script>))/i, "")
|
25
25
|
end
|
26
26
|
|
27
|
-
def remove_short_lines(text)
|
27
|
+
def remove_short_lines(text, number = 5)
|
28
28
|
text = text.gsub(/\s\s/, "\n")
|
29
29
|
str = ""
|
30
30
|
# remove short lines - usually just navigation
|
31
31
|
text.split("\n").each do |l|
|
32
|
-
str << l unless l.count(" ") < 5
|
32
|
+
str << l unless l.count(" ") < number
|
33
33
|
end
|
34
34
|
str
|
35
35
|
end
|
@@ -16,6 +16,16 @@ HTML
|
|
16
16
|
assert result == "the cat sat on the mat"
|
17
17
|
end
|
18
18
|
|
19
|
+
should "Remove shorter lines" do
|
20
|
+
input = <<-HTML
|
21
|
+
the cat sat on the mat
|
22
|
+
the cat sat on the slightly fluffy, yet worn and homely mat
|
23
|
+
a short line
|
24
|
+
HTML
|
25
|
+
result = Jkl::Text::remove_short_lines(input, 8)
|
26
|
+
assert result == "the cat sat on the slightly fluffy, yet worn and homely mat"
|
27
|
+
end
|
28
|
+
|
19
29
|
should "Remove script tags" do
|
20
30
|
input = <<-HTML
|
21
31
|
the cat sat on the mat
|