BoilerpipeArticle 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/boilerpipe_article.rb +17 -7
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 86e9801c9b5e6eacf758aa4bbff2ec298313e163
|
4
|
+
data.tar.gz: 58bdb8a6a7624e08f5f28f8bbbc094da2a3fc02a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f8dcba151737a254fb7f0840d24d187a757575ed87c8b0817ffb24dd3d3e3b28bebacf7ac4ee980bf8bc085317c9f3ae94a651c7c5b77fee43511f1a615077ab
|
7
|
+
data.tar.gz: e3a3ed339a688ae3ed778cd4f4c2e2e426ea070b94e07dcc156d945731b38961540f7a46fa4d421dbdac21aa47022c70df6586f6458bf4f620b95e3628278cd5
|
data/lib/boilerpipe_article.rb
CHANGED
@@ -5,13 +5,13 @@ class BoilerpipeArticle
|
|
5
5
|
@html = html
|
6
6
|
end
|
7
7
|
|
8
|
-
def
|
9
|
-
html = Nokogiri::HTML.parse(
|
8
|
+
def getText(html = @html)
|
9
|
+
html = Nokogiri::HTML.parse(html).to_s
|
10
10
|
html.gsub!(/<!-[\s\S]*?->/, '')
|
11
11
|
html.gsub!(/\r?\n|\r/, '')
|
12
12
|
|
13
13
|
doc = Nokogiri::HTML(html)
|
14
|
-
badHtmlTags = ['
|
14
|
+
badHtmlTags = ['nav','head','script','style','a','img']
|
15
15
|
badHtmlTags.each do |tag|
|
16
16
|
doc.search(tag).each do |src|
|
17
17
|
src.remove
|
@@ -19,10 +19,7 @@ class BoilerpipeArticle
|
|
19
19
|
end
|
20
20
|
|
21
21
|
html = doc.to_html.to_s
|
22
|
-
|
23
22
|
selfClosingTags = ['<area','<base','<br','<col','<command','<embed','<hr','<img','<input','<keygen','<link','<meta','<param','<source','<track','<wbr']
|
24
|
-
|
25
|
-
|
26
23
|
time = Time.now.to_f
|
27
24
|
depth = 1
|
28
25
|
i = 0
|
@@ -74,7 +71,20 @@ class BoilerpipeArticle
|
|
74
71
|
articlesStats.each do |line,stats|
|
75
72
|
text = "#{text} #{stats[0]}" if stats[1] == best
|
76
73
|
end
|
77
|
-
|
78
74
|
return Nokogiri::HTML.parse(text).text
|
79
75
|
end
|
76
|
+
|
77
|
+
# Collects the Open Graph values (<meta property="og:..."> tags) of a page.
#
# @param html [String] markup to inspect; defaults to the @html instance variable
# @return [Hash{String => String}] one entry per looked-up property, keyed by the
#   property name without the "og:" prefix ("title", "type", "url", "description",
#   "image", "updated_time", "locale", "site_name"). A property whose tag is
#   missing, or whose tag has no content attribute, maps to a single space (' ').
def getOgMetas(html = @html)
  # NOTE(review): getText parses with Nokogiri::HTML while this method uses the
  # generic Nokogiri.parse entry point; kept as-is, confirm that is intended.
  doc = Nokogiri.parse(html)

  # Each property is listed once; repeating a name would only repeat the lookup
  # and overwrite the same key with the same value.
  properties = %w[title type url description image updated_time locale site_name]

  properties.each_with_object({}) do |prop, metas|
    # Look the tag up once and reuse the node for the attribute read.
    node = doc.at("meta[property=\"og:#{prop}\"]")
    content = node && node['content']
    # ' ' is the placeholder for "no value", whether the tag or its content
    # attribute is absent.
    metas[prop] = content || ' '
  end
end
|
80
90
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: BoilerpipeArticle
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David Layer-Reiss
|
@@ -25,17 +25,17 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 1.6.8
|
27
27
|
description: This gem removes the surplus “clutter” (boilerplate, templates) around
|
28
|
-
the main textual content of a web page (pure Ruby implementation).
|
29
|
-
for usage examples.
|
28
|
+
the main textual content of a web page (pure Ruby implementation). BoilerpipeArticle
|
29
|
+
can be also used to parse open graph meta data. Check GitHub for usage examples.
|
30
30
|
email: layerreiss@gmail.com
|
31
31
|
executables: []
|
32
32
|
extensions: []
|
33
33
|
extra_rdoc_files: []
|
34
34
|
files:
|
35
35
|
- lib/boilerpipe_article.rb
|
36
|
-
homepage:
|
36
|
+
homepage: http://peppersoft.net/
|
37
37
|
licenses:
|
38
|
-
-
|
38
|
+
- GPL-2.0
|
39
39
|
metadata: {}
|
40
40
|
post_install_message:
|
41
41
|
rdoc_options: []
|