BoilerpipeArticle 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/boilerpipe_article.rb +17 -7
  3. metadata +5 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a2cbf7a8c4c979aecefeb76935ec831f8f7e010b
4
- data.tar.gz: 427bb161caf9302c93739c52ffdda87795d08ab1
3
+ metadata.gz: 86e9801c9b5e6eacf758aa4bbff2ec298313e163
4
+ data.tar.gz: 58bdb8a6a7624e08f5f28f8bbbc094da2a3fc02a
5
5
  SHA512:
6
- metadata.gz: d7268bf78155ffba0641a79c2aa83b36953f965587fe404ea96b41fbf7b960b7c70189a22690eedd9993ef2cd0f04d12408718a84e2a3ef5db445636be7f6fe7
7
- data.tar.gz: 8366fd8c1ad057b7bb3718109c6e9070e60dd68e06fc4372e9a16f1ed72c9dd7527039e46c841a4f6aafa86d8ae72829f8f1775c1d85d76f1ad9a8ddd40fd24a
6
+ metadata.gz: f8dcba151737a254fb7f0840d24d187a757575ed87c8b0817ffb24dd3d3e3b28bebacf7ac4ee980bf8bc085317c9f3ae94a651c7c5b77fee43511f1a615077ab
7
+ data.tar.gz: e3a3ed339a688ae3ed778cd4f4c2e2e426ea070b94e07dcc156d945731b38961540f7a46fa4d421dbdac21aa47022c70df6586f6458bf4f620b95e3628278cd5
@@ -5,13 +5,13 @@ class BoilerpipeArticle
5
5
  @html = html
6
6
  end
7
7
 
8
- def run
9
- html = Nokogiri::HTML.parse(@html).to_s
8
+ def getText(html = @html)
9
+ html = Nokogiri::HTML.parse(html).to_s
10
10
  html.gsub!(/<!-[\s\S]*?->/, '')
11
11
  html.gsub!(/\r?\n|\r/, '')
12
12
 
13
13
  doc = Nokogiri::HTML(html)
14
- badHtmlTags = ['li','ol','ul','head','script','style','a','img']
14
+ badHtmlTags = ['nav','head','script','style','a','img']
15
15
  badHtmlTags.each do |tag|
16
16
  doc.search(tag).each do |src|
17
17
  src.remove
@@ -19,10 +19,7 @@ class BoilerpipeArticle
19
19
  end
20
20
 
21
21
  html = doc.to_html.to_s
22
-
23
22
  selfClosingTags = ['<area','<base','<br','<col','<command','<embed','<hr','<img','<input','<keygen','<link','<meta','<param','<source','<track','<wbr']
24
-
25
-
26
23
  time = Time.now.to_f
27
24
  depth = 1
28
25
  i = 0
@@ -74,7 +71,20 @@ class BoilerpipeArticle
74
71
  articlesStats.each do |line,stats|
75
72
  text = "#{text} #{stats[0]}" if stats[1] == best
76
73
  end
77
-
78
74
  return Nokogiri::HTML.parse(text).text
79
75
  end
76
+
77
+ def getOgMetas(html = @html)
78
+ metas = Hash.new
79
+ doc = Nokogiri.parse(html)
80
+ properties = ['title','type','url','description','image','type','updated_time','locale','url','site_name']
81
+ properties.each do |prop|
82
+ if doc.at("meta[property=\"og:#{prop}\"]") != nil
83
+ metas.store(prop,doc.at("meta[property=\"og:#{prop}\"]")['content'])
84
+ else
85
+ metas.store(prop,' ')
86
+ end
87
+ end
88
+ return metas
89
+ end
80
90
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: BoilerpipeArticle
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Layer-Reiss
@@ -25,17 +25,17 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: 1.6.8
27
27
  description: This gem removes the surplus “clutter” (boilerplate, templates) around
28
- the main textual content of a web page (pure Ruby implementation). Check GitHub
29
- for usage examples.
28
+ the main textual content of a web page (pure Ruby implementation). BoilerpipeArticle
29
+ can be also used to parse open graph meta data. Check GitHub for usage examples.
30
30
  email: layerreiss@gmail.com
31
31
  executables: []
32
32
  extensions: []
33
33
  extra_rdoc_files: []
34
34
  files:
35
35
  - lib/boilerpipe_article.rb
36
- homepage: https://github.com/davidlr99/BoilerpipeArticle
36
+ homepage: http://peppersoft.net/
37
37
  licenses:
38
- - MIT
38
+ - GPL-2.0
39
39
  metadata: {}
40
40
  post_install_message:
41
41
  rdoc_options: []