siefca-httpage 0.0.4 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/httpage/httpage.rb +5 -2
- data/lib/httpage.rb +1 -0
- metadata +12 -3
data/lib/httpage/httpage.rb
CHANGED
@@ -174,16 +174,19 @@ class HTTPage
|
|
174
174
|
gsub(/<.*?>/m, ''))
|
175
175
|
end
|
176
176
|
|
177
|
-
# Transliterates text to ASCII and removes unknown characters.
|
177
|
+
# Transliterates text to ASCII and removes unknown characters leaving just words.
|
178
178
|
|
179
179
|
def clean_text(text=nil, enc=nil)
|
180
180
|
text ||= self.body
|
181
181
|
enc ||= self.encoding
|
182
182
|
page = Iconv.iconv('UTF-8//IGNORE', enc, text).join
|
183
|
-
page =
|
183
|
+
page = strip_html(page)
|
184
|
+
page.gsub!(/['`]/m, '_amp__')
|
185
|
+
page = Iconv.iconv('ASCII//TRANSLIT//IGNORE', 'UTF-8', page).join.downcase
|
184
186
|
page.tr!(".!?", ' ')
|
185
187
|
page.gsub!(/[^\x00-\x7F]+/, '')
|
186
188
|
page.gsub!(/[^a-z0-9\-_\+\s\n\.\!\?]+/im, '')
|
189
|
+
page.gsub!('_amp__',"'")
|
187
190
|
page.gsub!(%r{[.*?]}mi, '')
|
188
191
|
page.squeeze!(" \n")
|
189
192
|
page.gsub!(/^\s?\n\s?$/m, '')
|
data/lib/httpage.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: siefca-httpage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- "Pawe\xC5\x82 Wilk"
|
@@ -11,8 +11,17 @@ cert_chain: []
|
|
11
11
|
|
12
12
|
date: 2009-04-22 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
|
-
dependencies:
|
15
|
-
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: htmlentities
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
16
25
|
description: httpage is simple HTTP(S) reader with ability to transliterate body
|
17
26
|
email: pw@gnu.org
|
18
27
|
executables: []
|