siefca-httpage 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/httpage/httpage.rb +5 -2
- data/lib/httpage.rb +1 -0
- metadata +12 -3
data/lib/httpage/httpage.rb
CHANGED
@@ -174,16 +174,19 @@ class HTTPage
|
|
174
174
|
gsub(/<.*?>/m, ''))
|
175
175
|
end
|
176
176
|
|
177
|
-
# Transliterates text to ASCII and removes unknown characters.
|
177
|
+
# Transliterates text to ASCII and removes unknown characters leaving just words.
|
178
178
|
|
179
179
|
def clean_text(text=nil, enc=nil)
|
180
180
|
text ||= self.body
|
181
181
|
enc ||= self.encoding
|
182
182
|
page = Iconv.iconv('UTF-8//IGNORE', enc, text).join
|
183
|
-
page =
|
183
|
+
page = strip_html(page)
|
184
|
+
page.gsub!(/['`]/m, '_amp__')
|
185
|
+
page = Iconv.iconv('ASCII//TRANSLIT//IGNORE', 'UTF-8', page).join.downcase
|
184
186
|
page.tr!(".!?", ' ')
|
185
187
|
page.gsub!(/[^\x00-\x7F]+/, '')
|
186
188
|
page.gsub!(/[^a-z0-9\-_\+\s\n\.\!\?]+/im, '')
|
189
|
+
page.gsub!('_amp__',"'")
|
187
190
|
page.gsub!(%r{[.*?]}mi, '')
|
188
191
|
page.squeeze!(" \n")
|
189
192
|
page.gsub!(/^\s?\n\s?$/m, '')
|
data/lib/httpage.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: siefca-httpage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- "Pawe\xC5\x82 Wilk"
|
@@ -11,8 +11,17 @@ cert_chain: []
|
|
11
11
|
|
12
12
|
date: 2009-04-22 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
|
-
dependencies: []
|
15
|
-
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: htmlentities
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
16
25
|
description: httpage is simple HTTP(S) reader with ability to transliterate body
|
17
26
|
email: pw@gnu.org
|
18
27
|
executables: []
|