webtractor 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZWVkNmUwOWUwYTdjYmMyMDYwZWVjOWNhNWE1MGYyZmJmNGNjMzRkNw==
4
+ YjAwMDM5NDJhMzg4ODdkNThkZWQ5NmRiNGQzMzM0NDNhNjljMTkyZA==
5
5
  data.tar.gz: !binary |-
6
- ODlmZWZiNDFjZjg0ZDI4ODRjYjk2ZWVkZDY0YjBkNDMxY2VlMDk3Nw==
6
+ MzkzOWM4YTgyMDg1YWJlZmJiMmQxZTRlYWQ5ZTZmNDAwYmI2MjYzYw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- YTk4NzRiNTkxNDVkZmJhNDkxZDFkNmUwODVkZThmYjc1MDA3MDk2ZjZlMTg1
10
- YjBmNzM2NjU5NGU2Y2RjYjkzNzRkZmEwZDcwZTk1NzUyNzVkNThlOTBjMGNi
11
- MGJhOWY5YWRiNjhlOTc3OTFhMGMxY2IxMTFhY2QwNDVjZGRlNjI=
9
+ MDNhNTY1MzI2NDVjMzVmMzRmN2NjYTNiYTYxZDI3NjQ0Y2Y4NjllZmVkYTJh
10
+ ZmY5NzkyOWQ1YmNkZWRjMmE1OTM0MTBhMTU4NDU0MDA2MDI3NmI0MjEyY2I5
11
+ Yjg3YjZlYjEzMmE2YTdjM2MzOTM2ZjY0NTBkZjI3ZWU5YmJkNGY=
12
12
  data.tar.gz: !binary |-
13
- OWNlOTdjMTg2MDA3YzhhYzkwMDE3OTU0NjUzZDllZDY5M2FkY2NjOWZjYjAx
14
- ZTk2MDJmODc5OWQ0ZGZjZjIzYzA4YmU4NTQ4MWY3M2E4ZTg3NjY0ODE4ZjM1
15
- YTlmMWZiOWIxNzlmZWI2YzY5MzllMDVmZTFhNzJlMzZkZjViODk=
13
+ YjMzMWRjY2QyNDk2OTBlNjU1OTBmMDlhNzQ1NTdkYWQ5ZWFlNjljOWMxZjg1
14
+ ZThhZGY0MWU2ODRlOWY2YTg0MGEyNjUzZTRmM2FjZDhmZThlYWM0YjgwZTAw
15
+ ODRmZWMzOTViZTVmMWRmM2Q5ZGI5YjJmOTA0Mzk3MzczYTg4YmU=
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # Webtractor
2
2
 
3
3
  The Webtractor is a ruby library which is able to extract main content
4
- from webpages like news, blogs, etc. As a result you can just a main
4
+ from webpages like news, blogs, etc. As a result you can just have a main
5
5
  content without any boilerplate (menu, footer, comments, etc).
6
6
 
7
7
  ## Installation
@@ -89,4 +89,4 @@ extractor.add_filter RemoveBolds.new
89
89
 
90
90
  ## License
91
91
 
92
- This library is distributed under the Bearware license.
92
+ This library is distributed under the Beerware license.
data/lib/webtractor.rb CHANGED
@@ -16,6 +16,7 @@ require 'webtractor/filters/remove_comments'
16
16
  require 'webtractor/filters/remove_noncontent'
17
17
  require 'webtractor/filters/remove_menus'
18
18
  require 'webtractor/filters/remove_footer'
19
+ require 'webtractor/filters/remove_header'
19
20
  require 'webtractor/filters/remove_embeds'
20
21
  require 'webtractor/filters/remove_smallest'
21
22
  require 'webtractor/filters/remove_empty'
@@ -11,6 +11,7 @@ module Webtractor::Filters
11
11
  RemoveNoncontent.new,
12
12
  RemoveMenus.new,
13
13
  RemoveFooter.new,
14
+ RemoveHeader.new,
14
15
  RemoveEmbeds.new,
15
16
  RemoveSmallest.new,
16
17
  RemoveEmpty.new,
@@ -3,6 +3,7 @@ module Webtractor::Filters
3
3
  def process page
4
4
  page.css('embed').remove
5
5
  page.css('object').remove
6
+ page.xpath('//*[contains(@class, "video")]').remove
6
7
  page
7
8
  end
8
9
  end
@@ -2,6 +2,8 @@ module Webtractor::Filters
2
2
  class RemoveFooter
3
3
  def process page
4
4
  page.css('footer').remove
5
+ page.xpath('//*[contains(@id, "footer")]').remove
6
+ page.xpath('//*[contains(@class, "footer")]').remove
5
7
  page
6
8
  end
7
9
  end
@@ -0,0 +1,10 @@
1
+ module Webtractor::Filters
2
+ class RemoveHeader
3
+ def process page
4
+ page.css('header').remove
5
+ page.xpath('//*[contains(@class, "header")]').remove
6
+ page.xpath('//*[contains(@id, "header")]').remove
7
+ page
8
+ end
9
+ end
10
+ end
@@ -2,8 +2,12 @@ module Webtractor::Filters
2
2
  class RemoveMenus
3
3
  def process page
4
4
  page.css('nav').remove
5
+ page.css('#pane').remove
6
+ page.css('#carousel').remove
7
+ page.css('#sidebar').remove
5
8
  page.css('.pane').remove
6
9
  page.css('.carousel').remove
10
+ page.css('.sidebar').remove
7
11
 
8
12
  page.css('ul').each do |ul|
9
13
  li_count = ul.css('li').count
@@ -17,7 +21,6 @@ module Webtractor::Filters
17
21
  end
18
22
 
19
23
  def explore path, node
20
-
21
24
  path += "/#{node.name}"
22
25
 
23
26
  node.children.each do |child|
@@ -27,7 +30,8 @@ module Webtractor::Filters
27
30
  return if node.name == 'p'
28
31
 
29
32
  links_count = node.xpath('./a').size
30
- if links_count > 0 && links_count.to_f/node.children.count > 0.3
33
+
34
+ if links_count > 0 && links_count.to_f/node.children.count >= 0.3
31
35
  node.remove
32
36
  end
33
37
  end
@@ -7,5 +7,9 @@ module Webtractor
7
7
  @text = xml.text
8
8
  @xml = xml
9
9
  end
10
+
11
+ def save filename
12
+ File.write(filename, @xml.to_s)
13
+ end
10
14
  end
11
15
  end
@@ -1,3 +1,3 @@
1
1
  module Webtractor
2
- VERSION = '0.0.1'
2
+ VERSION = '0.0.2'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webtractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-25 00:00:00.000000000 Z
11
+ date: 2014-05-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -73,6 +73,7 @@ files:
73
73
  - lib/webtractor/filters/remove_empty.rb
74
74
  - lib/webtractor/filters/remove_footer.rb
75
75
  - lib/webtractor/filters/remove_forms.rb
76
+ - lib/webtractor/filters/remove_header.rb
76
77
  - lib/webtractor/filters/remove_images.rb
77
78
  - lib/webtractor/filters/remove_menus.rb
78
79
  - lib/webtractor/filters/remove_noncontent.rb