webtractor 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZWVkNmUwOWUwYTdjYmMyMDYwZWVjOWNhNWE1MGYyZmJmNGNjMzRkNw==
4
+ YjAwMDM5NDJhMzg4ODdkNThkZWQ5NmRiNGQzMzM0NDNhNjljMTkyZA==
5
5
  data.tar.gz: !binary |-
6
- ODlmZWZiNDFjZjg0ZDI4ODRjYjk2ZWVkZDY0YjBkNDMxY2VlMDk3Nw==
6
+ MzkzOWM4YTgyMDg1YWJlZmJiMmQxZTRlYWQ5ZTZmNDAwYmI2MjYzYw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- YTk4NzRiNTkxNDVkZmJhNDkxZDFkNmUwODVkZThmYjc1MDA3MDk2ZjZlMTg1
10
- YjBmNzM2NjU5NGU2Y2RjYjkzNzRkZmEwZDcwZTk1NzUyNzVkNThlOTBjMGNi
11
- MGJhOWY5YWRiNjhlOTc3OTFhMGMxY2IxMTFhY2QwNDVjZGRlNjI=
9
+ MDNhNTY1MzI2NDVjMzVmMzRmN2NjYTNiYTYxZDI3NjQ0Y2Y4NjllZmVkYTJh
10
+ ZmY5NzkyOWQ1YmNkZWRjMmE1OTM0MTBhMTU4NDU0MDA2MDI3NmI0MjEyY2I5
11
+ Yjg3YjZlYjEzMmE2YTdjM2MzOTM2ZjY0NTBkZjI3ZWU5YmJkNGY=
12
12
  data.tar.gz: !binary |-
13
- OWNlOTdjMTg2MDA3YzhhYzkwMDE3OTU0NjUzZDllZDY5M2FkY2NjOWZjYjAx
14
- ZTk2MDJmODc5OWQ0ZGZjZjIzYzA4YmU4NTQ4MWY3M2E4ZTg3NjY0ODE4ZjM1
15
- YTlmMWZiOWIxNzlmZWI2YzY5MzllMDVmZTFhNzJlMzZkZjViODk=
13
+ YjMzMWRjY2QyNDk2OTBlNjU1OTBmMDlhNzQ1NTdkYWQ5ZWFlNjljOWMxZjg1
14
+ ZThhZGY0MWU2ODRlOWY2YTg0MGEyNjUzZTRmM2FjZDhmZThlYWM0YjgwZTAw
15
+ ODRmZWMzOTViZTVmMWRmM2Q5ZGI5YjJmOTA0Mzk3MzczYTg4YmU=
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # Webtractor
2
2
 
3
3
  The Webtractor is a ruby library which is able to extract main content
4
- from webpages like news, blogs, etc. As a result you can just a main
4
+ from webpages like news, blogs, etc. As a result you can just have a main
5
5
  content without any boilerplate (menu, footer, comments, etc).
6
6
 
7
7
  ## Installation
@@ -89,4 +89,4 @@ extractor.add_filter RemoveBolds.new
89
89
 
90
90
  ## License
91
91
 
92
- This library is distributed under the Bearware license.
92
+ This library is distributed under the Beerware license.
data/lib/webtractor.rb CHANGED
@@ -16,6 +16,7 @@ require 'webtractor/filters/remove_comments'
16
16
  require 'webtractor/filters/remove_noncontent'
17
17
  require 'webtractor/filters/remove_menus'
18
18
  require 'webtractor/filters/remove_footer'
19
+ require 'webtractor/filters/remove_header'
19
20
  require 'webtractor/filters/remove_embeds'
20
21
  require 'webtractor/filters/remove_smallest'
21
22
  require 'webtractor/filters/remove_empty'
@@ -11,6 +11,7 @@ module Webtractor::Filters
11
11
  RemoveNoncontent.new,
12
12
  RemoveMenus.new,
13
13
  RemoveFooter.new,
14
+ RemoveHeader.new,
14
15
  RemoveEmbeds.new,
15
16
  RemoveSmallest.new,
16
17
  RemoveEmpty.new,
@@ -3,6 +3,7 @@ module Webtractor::Filters
3
3
  def process page
4
4
  page.css('embed').remove
5
5
  page.css('object').remove
6
+ page.xpath('//*[contains(@class, "video")]').remove
6
7
  page
7
8
  end
8
9
  end
@@ -2,6 +2,8 @@ module Webtractor::Filters
2
2
  class RemoveFooter
3
3
  def process page
4
4
  page.css('footer').remove
5
+ page.xpath('//*[contains(@id, "footer")]').remove
6
+ page.xpath('//*[contains(@class, "footer")]').remove
5
7
  page
6
8
  end
7
9
  end
@@ -0,0 +1,10 @@
1
+ module Webtractor::Filters
2
+ class RemoveHeader
3
+ def process page
4
+ page.css('header').remove
5
+ page.xpath('//*[contains(@class, "header")]').remove
6
+ page.xpath('//*[contains(@id, "header")]').remove
7
+ page
8
+ end
9
+ end
10
+ end
@@ -2,8 +2,12 @@ module Webtractor::Filters
2
2
  class RemoveMenus
3
3
  def process page
4
4
  page.css('nav').remove
5
+ page.css('#pane').remove
6
+ page.css('#carousel').remove
7
+ page.css('#sidebar').remove
5
8
  page.css('.pane').remove
6
9
  page.css('.carousel').remove
10
+ page.css('.sidebar').remove
7
11
 
8
12
  page.css('ul').each do |ul|
9
13
  li_count = ul.css('li').count
@@ -17,7 +21,6 @@ module Webtractor::Filters
17
21
  end
18
22
 
19
23
  def explore path, node
20
-
21
24
  path += "/#{node.name}"
22
25
 
23
26
  node.children.each do |child|
@@ -27,7 +30,8 @@ module Webtractor::Filters
27
30
  return if node.name == 'p'
28
31
 
29
32
  links_count = node.xpath('./a').size
30
- if links_count > 0 && links_count.to_f/node.children.count > 0.3
33
+
34
+ if links_count > 0 && links_count.to_f/node.children.count >= 0.3
31
35
  node.remove
32
36
  end
33
37
  end
@@ -7,5 +7,9 @@ module Webtractor
7
7
  @text = xml.text
8
8
  @xml = xml
9
9
  end
10
+
11
+ def save filename
12
+ File.write(filename, @xml.to_s)
13
+ end
10
14
  end
11
15
  end
@@ -1,3 +1,3 @@
1
1
  module Webtractor
2
- VERSION = '0.0.1'
2
+ VERSION = '0.0.2'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webtractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-25 00:00:00.000000000 Z
11
+ date: 2014-05-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -73,6 +73,7 @@ files:
73
73
  - lib/webtractor/filters/remove_empty.rb
74
74
  - lib/webtractor/filters/remove_footer.rb
75
75
  - lib/webtractor/filters/remove_forms.rb
76
+ - lib/webtractor/filters/remove_header.rb
76
77
  - lib/webtractor/filters/remove_images.rb
77
78
  - lib/webtractor/filters/remove_menus.rb
78
79
  - lib/webtractor/filters/remove_noncontent.rb