webtractor 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/README.md +2 -2
- data/lib/webtractor.rb +1 -0
- data/lib/webtractor/filters/default_filter.rb +1 -0
- data/lib/webtractor/filters/remove_embeds.rb +1 -0
- data/lib/webtractor/filters/remove_footer.rb +2 -0
- data/lib/webtractor/filters/remove_header.rb +10 -0
- data/lib/webtractor/filters/remove_menus.rb +6 -2
- data/lib/webtractor/result.rb +4 -0
- data/lib/webtractor/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YjAwMDM5NDJhMzg4ODdkNThkZWQ5NmRiNGQzMzM0NDNhNjljMTkyZA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
MzkzOWM4YTgyMDg1YWJlZmJiMmQxZTRlYWQ5ZTZmNDAwYmI2MjYzYw==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MDNhNTY1MzI2NDVjMzVmMzRmN2NjYTNiYTYxZDI3NjQ0Y2Y4NjllZmVkYTJh
|
10
|
+
ZmY5NzkyOWQ1YmNkZWRjMmE1OTM0MTBhMTU4NDU0MDA2MDI3NmI0MjEyY2I5
|
11
|
+
Yjg3YjZlYjEzMmE2YTdjM2MzOTM2ZjY0NTBkZjI3ZWU5YmJkNGY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YjMzMWRjY2QyNDk2OTBlNjU1OTBmMDlhNzQ1NTdkYWQ5ZWFlNjljOWMxZjg1
|
14
|
+
ZThhZGY0MWU2ODRlOWY2YTg0MGEyNjUzZTRmM2FjZDhmZThlYWM0YjgwZTAw
|
15
|
+
ODRmZWMzOTViZTVmMWRmM2Q5ZGI5YjJmOTA0Mzk3MzczYTg4YmU=
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# Webtractor
|
2
2
|
|
3
3
|
The Webtractor is a ruby library which is able to extract main content
|
4
|
-
from webpages like news, blogs, etc. As a result you can just a main
|
4
|
+
from webpages like news, blogs, etc. As a result you can just have a main
|
5
5
|
content without any boilerplate (menu, footer, comments, etc).
|
6
6
|
|
7
7
|
## Installation
|
@@ -89,4 +89,4 @@ extractor.add_filter RemoveBolds.new
|
|
89
89
|
|
90
90
|
## License
|
91
91
|
|
92
|
-
This library is distributed under the
|
92
|
+
This library is distributed under the Beerware license.
|
data/lib/webtractor.rb
CHANGED
@@ -16,6 +16,7 @@ require 'webtractor/filters/remove_comments'
|
|
16
16
|
require 'webtractor/filters/remove_noncontent'
|
17
17
|
require 'webtractor/filters/remove_menus'
|
18
18
|
require 'webtractor/filters/remove_footer'
|
19
|
+
require 'webtractor/filters/remove_header'
|
19
20
|
require 'webtractor/filters/remove_embeds'
|
20
21
|
require 'webtractor/filters/remove_smallest'
|
21
22
|
require 'webtractor/filters/remove_empty'
|
data/lib/webtractor/filters/remove_menus.rb
CHANGED
@@ -2,8 +2,12 @@ module Webtractor::Filters
|
|
2
2
|
class RemoveMenus
|
3
3
|
def process page
|
4
4
|
page.css('nav').remove
|
5
|
+
page.css('#pane').remove
|
6
|
+
page.css('#carousel').remove
|
7
|
+
page.css('#sidebar').remove
|
5
8
|
page.css('.pane').remove
|
6
9
|
page.css('.carousel').remove
|
10
|
+
page.css('.sidebar').remove
|
7
11
|
|
8
12
|
page.css('ul').each do |ul|
|
9
13
|
li_count = ul.css('li').count
|
@@ -17,7 +21,6 @@ module Webtractor::Filters
|
|
17
21
|
end
|
18
22
|
|
19
23
|
def explore path, node
|
20
|
-
|
21
24
|
path += "/#{node.name}"
|
22
25
|
|
23
26
|
node.children.each do |child|
|
@@ -27,7 +30,8 @@ module Webtractor::Filters
|
|
27
30
|
return if node.name == 'p'
|
28
31
|
|
29
32
|
links_count = node.xpath('./a').size
|
30
|
-
|
33
|
+
|
34
|
+
if links_count > 0 && links_count.to_f/node.children.count >= 0.3
|
31
35
|
node.remove
|
32
36
|
end
|
33
37
|
end
|
data/lib/webtractor/result.rb
CHANGED
data/lib/webtractor/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webtractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -73,6 +73,7 @@ files:
|
|
73
73
|
- lib/webtractor/filters/remove_empty.rb
|
74
74
|
- lib/webtractor/filters/remove_footer.rb
|
75
75
|
- lib/webtractor/filters/remove_forms.rb
|
76
|
+
- lib/webtractor/filters/remove_header.rb
|
76
77
|
- lib/webtractor/filters/remove_images.rb
|
77
78
|
- lib/webtractor/filters/remove_menus.rb
|
78
79
|
- lib/webtractor/filters/remove_noncontent.rb
|