BoilerpipeArticle 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/boilerpipe_article.rb +17 -7
  3. metadata +5 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a2cbf7a8c4c979aecefeb76935ec831f8f7e010b
4
- data.tar.gz: 427bb161caf9302c93739c52ffdda87795d08ab1
3
+ metadata.gz: 86e9801c9b5e6eacf758aa4bbff2ec298313e163
4
+ data.tar.gz: 58bdb8a6a7624e08f5f28f8bbbc094da2a3fc02a
5
5
  SHA512:
6
- metadata.gz: d7268bf78155ffba0641a79c2aa83b36953f965587fe404ea96b41fbf7b960b7c70189a22690eedd9993ef2cd0f04d12408718a84e2a3ef5db445636be7f6fe7
7
- data.tar.gz: 8366fd8c1ad057b7bb3718109c6e9070e60dd68e06fc4372e9a16f1ed72c9dd7527039e46c841a4f6aafa86d8ae72829f8f1775c1d85d76f1ad9a8ddd40fd24a
6
+ metadata.gz: f8dcba151737a254fb7f0840d24d187a757575ed87c8b0817ffb24dd3d3e3b28bebacf7ac4ee980bf8bc085317c9f3ae94a651c7c5b77fee43511f1a615077ab
7
+ data.tar.gz: e3a3ed339a688ae3ed778cd4f4c2e2e426ea070b94e07dcc156d945731b38961540f7a46fa4d421dbdac21aa47022c70df6586f6458bf4f620b95e3628278cd5
@@ -5,13 +5,13 @@ class BoilerpipeArticle
5
5
  @html = html
6
6
  end
7
7
 
8
- def run
9
- html = Nokogiri::HTML.parse(@html).to_s
8
+ def getText(html = @html)
9
+ html = Nokogiri::HTML.parse(html).to_s
10
10
  html.gsub!(/<!-[\s\S]*?->/, '')
11
11
  html.gsub!(/\r?\n|\r/, '')
12
12
 
13
13
  doc = Nokogiri::HTML(html)
14
- badHtmlTags = ['li','ol','ul','head','script','style','a','img']
14
+ badHtmlTags = ['nav','head','script','style','a','img']
15
15
  badHtmlTags.each do |tag|
16
16
  doc.search(tag).each do |src|
17
17
  src.remove
@@ -19,10 +19,7 @@ class BoilerpipeArticle
19
19
  end
20
20
 
21
21
  html = doc.to_html.to_s
22
-
23
22
  selfClosingTags = ['<area','<base','<br','<col','<command','<embed','<hr','<img','<input','<keygen','<link','<meta','<param','<source','<track','<wbr']
24
-
25
-
26
23
  time = Time.now.to_f
27
24
  depth = 1
28
25
  i = 0
@@ -74,7 +71,20 @@ class BoilerpipeArticle
74
71
  articlesStats.each do |line,stats|
75
72
  text = "#{text} #{stats[0]}" if stats[1] == best
76
73
  end
77
-
78
74
  return Nokogiri::HTML.parse(text).text
79
75
  end
76
+
77
+ def getOgMetas(html = @html)
78
+ metas = Hash.new
79
+ doc = Nokogiri.parse(html)
80
+ properties = ['title','type','url','description','image','type','updated_time','locale','url','site_name']
81
+ properties.each do |prop|
82
+ if doc.at("meta[property=\"og:#{prop}\"]") != nil
83
+ metas.store(prop,doc.at("meta[property=\"og:#{prop}\"]")['content'])
84
+ else
85
+ metas.store(prop,' ')
86
+ end
87
+ end
88
+ return metas
89
+ end
80
90
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: BoilerpipeArticle
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Layer-Reiss
@@ -25,17 +25,17 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: 1.6.8
27
27
  description: This gem removes the surplus “clutter” (boilerplate, templates) around
28
- the main textual content of a web page (pure Ruby implementation). Check GitHub
29
- for usage examples.
28
+ the main textual content of a web page (pure Ruby implementation). BoilerpipeArticle
29
+ can be also used to parse open graph meta data. Check GitHub for usage examples.
30
30
  email: layerreiss@gmail.com
31
31
  executables: []
32
32
  extensions: []
33
33
  extra_rdoc_files: []
34
34
  files:
35
35
  - lib/boilerpipe_article.rb
36
- homepage: https://github.com/davidlr99/BoilerpipeArticle
36
+ homepage: http://peppersoft.net/
37
37
  licenses:
38
- - MIT
38
+ - GPL-2.0
39
39
  metadata: {}
40
40
  post_install_message:
41
41
  rdoc_options: []