webtractor 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/README.md +2 -2
- data/lib/webtractor.rb +1 -0
- data/lib/webtractor/filters/default_filter.rb +1 -0
- data/lib/webtractor/filters/remove_embeds.rb +1 -0
- data/lib/webtractor/filters/remove_footer.rb +2 -0
- data/lib/webtractor/filters/remove_header.rb +10 -0
- data/lib/webtractor/filters/remove_menus.rb +6 -2
- data/lib/webtractor/result.rb +4 -0
- data/lib/webtractor/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YjAwMDM5NDJhMzg4ODdkNThkZWQ5NmRiNGQzMzM0NDNhNjljMTkyZA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
MzkzOWM4YTgyMDg1YWJlZmJiMmQxZTRlYWQ5ZTZmNDAwYmI2MjYzYw==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MDNhNTY1MzI2NDVjMzVmMzRmN2NjYTNiYTYxZDI3NjQ0Y2Y4NjllZmVkYTJh
|
10
|
+
ZmY5NzkyOWQ1YmNkZWRjMmE1OTM0MTBhMTU4NDU0MDA2MDI3NmI0MjEyY2I5
|
11
|
+
Yjg3YjZlYjEzMmE2YTdjM2MzOTM2ZjY0NTBkZjI3ZWU5YmJkNGY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YjMzMWRjY2QyNDk2OTBlNjU1OTBmMDlhNzQ1NTdkYWQ5ZWFlNjljOWMxZjg1
|
14
|
+
ZThhZGY0MWU2ODRlOWY2YTg0MGEyNjUzZTRmM2FjZDhmZThlYWM0YjgwZTAw
|
15
|
+
ODRmZWMzOTViZTVmMWRmM2Q5ZGI5YjJmOTA0Mzk3MzczYTg4YmU=
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# Webtractor
|
2
2
|
|
3
3
|
The Webtractor is a ruby library which is able to extract main content
|
4
|
-
from webpages like news, blogs, etc. As a result you can just a main
|
4
|
+
from webpages like news, blogs, etc. As a result you can just have a main
|
5
5
|
content without any boilerplate (menu, footer, comments, etc).
|
6
6
|
|
7
7
|
## Installation
|
@@ -89,4 +89,4 @@ extractor.add_filter RemoveBolds.new
|
|
89
89
|
|
90
90
|
## License
|
91
91
|
|
92
|
-
This library is distributed under the
|
92
|
+
This library is distributed under the Beerware license.
|
data/lib/webtractor.rb
CHANGED
@@ -16,6 +16,7 @@ require 'webtractor/filters/remove_comments'
|
|
16
16
|
require 'webtractor/filters/remove_noncontent'
|
17
17
|
require 'webtractor/filters/remove_menus'
|
18
18
|
require 'webtractor/filters/remove_footer'
|
19
|
+
require 'webtractor/filters/remove_header'
|
19
20
|
require 'webtractor/filters/remove_embeds'
|
20
21
|
require 'webtractor/filters/remove_smallest'
|
21
22
|
require 'webtractor/filters/remove_empty'
|
data/lib/webtractor/filters/remove_menus.rb
CHANGED
@@ -2,8 +2,12 @@ module Webtractor::Filters
|
|
2
2
|
class RemoveMenus
|
3
3
|
def process page
|
4
4
|
page.css('nav').remove
|
5
|
+
page.css('#pane').remove
|
6
|
+
page.css('#carousel').remove
|
7
|
+
page.css('#sidebar').remove
|
5
8
|
page.css('.pane').remove
|
6
9
|
page.css('.carousel').remove
|
10
|
+
page.css('.sidebar').remove
|
7
11
|
|
8
12
|
page.css('ul').each do |ul|
|
9
13
|
li_count = ul.css('li').count
|
@@ -17,7 +21,6 @@ module Webtractor::Filters
|
|
17
21
|
end
|
18
22
|
|
19
23
|
def explore path, node
|
20
|
-
|
21
24
|
path += "/#{node.name}"
|
22
25
|
|
23
26
|
node.children.each do |child|
|
@@ -27,7 +30,8 @@ module Webtractor::Filters
|
|
27
30
|
return if node.name == 'p'
|
28
31
|
|
29
32
|
links_count = node.xpath('./a').size
|
30
|
-
|
33
|
+
|
34
|
+
if links_count > 0 && links_count.to_f/node.children.count >= 0.3
|
31
35
|
node.remove
|
32
36
|
end
|
33
37
|
end
|
data/lib/webtractor/result.rb
CHANGED
data/lib/webtractor/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webtractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -73,6 +73,7 @@ files:
|
|
73
73
|
- lib/webtractor/filters/remove_empty.rb
|
74
74
|
- lib/webtractor/filters/remove_footer.rb
|
75
75
|
- lib/webtractor/filters/remove_forms.rb
|
76
|
+
- lib/webtractor/filters/remove_header.rb
|
76
77
|
- lib/webtractor/filters/remove_images.rb
|
77
78
|
- lib/webtractor/filters/remove_menus.rb
|
78
79
|
- lib/webtractor/filters/remove_noncontent.rb
|