webtractor 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/LICENSE +8 -0
- data/README.md +92 -0
- data/lib/webtractor/extractor.rb +47 -0
- data/lib/webtractor/filters/biggest_block.rb +37 -0
- data/lib/webtractor/filters/default_filter.rb +22 -0
- data/lib/webtractor/filters/filter_group.rb +18 -0
- data/lib/webtractor/filters/remove_attrs.rb +12 -0
- data/lib/webtractor/filters/remove_comments.rb +9 -0
- data/lib/webtractor/filters/remove_embeds.rb +9 -0
- data/lib/webtractor/filters/remove_empty.rb +18 -0
- data/lib/webtractor/filters/remove_footer.rb +8 -0
- data/lib/webtractor/filters/remove_forms.rb +8 -0
- data/lib/webtractor/filters/remove_images.rb +8 -0
- data/lib/webtractor/filters/remove_menus.rb +35 -0
- data/lib/webtractor/filters/remove_noncontent.rb +9 -0
- data/lib/webtractor/filters/remove_noncontent_elements.rb +9 -0
- data/lib/webtractor/filters/remove_scripts.rb +9 -0
- data/lib/webtractor/filters/remove_smallest.rb +19 -0
- data/lib/webtractor/filters/remove_styles.rb +8 -0
- data/lib/webtractor/filters/remove_tables.rb +8 -0
- data/lib/webtractor/result.rb +11 -0
- data/lib/webtractor/version.rb +3 -0
- data/lib/webtractor.rb +29 -0
- metadata +112 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
ZWVkNmUwOWUwYTdjYmMyMDYwZWVjOWNhNWE1MGYyZmJmNGNjMzRkNw==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
ODlmZWZiNDFjZjg0ZDI4ODRjYjk2ZWVkZDY0YjBkNDMxY2VlMDk3Nw==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
YTk4NzRiNTkxNDVkZmJhNDkxZDFkNmUwODVkZThmYjc1MDA3MDk2ZjZlMTg1
|
10
|
+
YjBmNzM2NjU5NGU2Y2RjYjkzNzRkZmEwZDcwZTk1NzUyNzVkNThlOTBjMGNi
|
11
|
+
MGJhOWY5YWRiNjhlOTc3OTFhMGMxY2IxMTFhY2QwNDVjZGRlNjI=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
OWNlOTdjMTg2MDA3YzhhYzkwMDE3OTU0NjUzZDllZDY5M2FkY2NjOWZjYjAx
|
14
|
+
ZTk2MDJmODc5OWQ0ZGZjZjIzYzA4YmU4NTQ4MWY3M2E4ZTg3NjY0ODE4ZjM1
|
15
|
+
YTlmMWZiOWIxNzlmZWI2YzY5MzllMDVmZTFhNzJlMzZkZjViODk=
|
data/LICENSE
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
/*
|
2
|
+
* ----------------------------------------------------------------------------
|
3
|
+
* "THE BEER-WARE LICENSE" (Revision 42):
|
4
|
+
* As long as you retain this notice you can do whatever you want with this
|
5
|
+
* stuff. If we meet some day, and you think this stuff is worth it, you can
|
6
|
+
* buy me a beer in return. Rene Klacan
|
7
|
+
* ----------------------------------------------------------------------------
|
8
|
+
*/
|
data/README.md
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
# Webtractor
|
2
|
+
|
3
|
+
The Webtractor is a ruby library which is able to extract main content
|
4
|
+
from webpages like news, blogs, etc. As a result you get just the main
|
5
|
+
content without any boilerplate (menu, footer, comments, etc).
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
You can install it directly via gem:
|
10
|
+
|
11
|
+
```
|
12
|
+
gem install webtractor
|
13
|
+
```
|
14
|
+
|
15
|
+
Or you can put it in your Gemfile:
|
16
|
+
|
17
|
+
```ruby
|
18
|
+
gem 'webtractor'
|
19
|
+
```
|
20
|
+
|
21
|
+
Then run:
|
22
|
+
|
23
|
+
```
|
24
|
+
bundle install
|
25
|
+
```
|
26
|
+
|
27
|
+
## Basic usage
|
28
|
+
|
29
|
+
```ruby
|
30
|
+
extractor = Webtractor::Extractor.new
|
31
|
+
result = extractor.extract_from_url(
|
32
|
+
'http://techcrunch.com/2014/05/24/dont-believe-anyone-who-tells-you-learning-to-code-is-easy/')
|
33
|
+
puts result.text
|
34
|
+
```
|
35
|
+
|
36
|
+
Or
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
extractor = Webtractor::Extractor.new
|
40
|
+
result = extractor.extract '<html><body>...</body></html>'
|
41
|
+
```
|
42
|
+
|
43
|
+
Or
|
44
|
+
|
45
|
+
```ruby
|
46
|
+
page = Nokogiri::HTML(...)
|
47
|
+
extractor = Webtractor::Extractor.new
|
48
|
+
result = extractor.extract_from_xml page
|
49
|
+
```
|
50
|
+
|
51
|
+
You can also access the Nokogiri document from the result via its xml attribute:
|
52
|
+
|
53
|
+
```ruby
|
54
|
+
puts result.xml.xpath('...').text
|
55
|
+
```
|
56
|
+
|
57
|
+
## Advanced usage
|
58
|
+
|
59
|
+
The process of extracting the main content from a webpage is really simple. It
|
60
|
+
consists of applying multiple filters on the document where every filter
|
61
|
+
receives as input the output of the previously applied filter.
|
62
|
+
|
63
|
+
You can look at the names of default filters:
|
64
|
+
|
65
|
+
```ruby
|
66
|
+
p Webtractor::Filters::DefaultFilter.new.filters.map{|f| f.class.to_s}
|
67
|
+
```
|
68
|
+
|
69
|
+
You can remove any filter:
|
70
|
+
|
71
|
+
```ruby
|
72
|
+
extractor.remove_filter Webtractor::Filters::RemoveComments
|
73
|
+
```
|
74
|
+
|
75
|
+
Or you can also create your own filter. It can be any class which
|
76
|
+
implements a *process* method which takes the page as an argument and returns the
|
77
|
+
page:
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
class RemoveBolds
|
81
|
+
def process page
|
82
|
+
page.css('b').remove
|
83
|
+
page
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
extractor.add_filter RemoveBolds.new
|
88
|
+
```
|
89
|
+
|
90
|
+
## License
|
91
|
+
|
92
|
+
This library is distributed under the Beerware license.
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Webtractor
  # Runs a document through an ordered list of filters and wraps the outcome
  # in a Result. A filter is any object responding to `process(page)` and
  # returning the (possibly replaced) page.
  class Extractor
    attr_accessor :filters

    # params - options Hash:
    #   :filters      - Array of filter instances (default: one DefaultFilter)
    #   :cache        - truthy to cache downloaded pages through Cachy
    #   :cache_params - options forwarded to Cachy.cache_if
    def initialize params={}
      @filters = params[:filters] || [Filters::DefaultFilter.new]
      @cache = params[:cache] || false
      @cache_params = params[:cache_params] || {}
    end

    # Parses an HTML string and extracts its content.
    def extract text
      extract_from_xml(Nokogiri::HTML(text))
    end

    # Feeds the page through every filter in order; each filter receives the
    # value returned by the previous one. The title is read before filtering
    # because filters are free to strip or replace parts of the document.
    def extract_from_xml page
      title = page.xpath('//head/title').text
      filtered = @filters.reduce(page) { |current, filter| filter.process(current) }
      Result.new(title, filtered)
    end

    # Downloads the page at url (optionally through the Cachy cache) and
    # extracts its content.
    #
    # The download goes through the URI object's open-uri `read` instead of
    # Kernel#open: Kernel#open stopped accepting URLs in Ruby 3.0 and, on
    # older Rubies, treats a string starting with "|" as a command to run.
    #
    # Raises ArgumentError when the URL scheme cannot be fetched with open-uri
    # and URI::InvalidURIError when url is not a valid URI.
    def extract_from_url url
      uri = URI(url)
      unless uri.respond_to?(:read)
        raise ArgumentError, "cannot fetch #{url.inspect}: unsupported URL scheme"
      end

      content = Cachy.cache_if(@cache, "webtractor.#{url}", @cache_params) { uri.read }
      extract(content)
    end

    # Appends a filter; a Class is instantiated with no arguments first.
    def add_filter filter
      @filters << (filter.is_a?(Class) ? filter.new : filter)
    end

    # Removes every filter that is an instance of the given class (or of the
    # class of the given instance). Returns the remaining filter list.
    #
    # delete_if is used instead of assigning the result of reject!, because
    # reject! returns nil when nothing matched and that nil used to wipe out
    # @filters. The request is also forwarded to any remaining filter that
    # responds to remove_filter, so filters nested in a group can be removed.
    def remove_filter filter
      klass = filter.is_a?(Class) ? filter : filter.class
      @filters.delete_if { |f| f.is_a?(klass) }
      @filters.each { |f| f.remove_filter(klass) if f.respond_to?(:remove_filter) }
      @filters
    end

    # Empties the filter list in place.
    def clear_filters
      @filters.clear
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Webtractor
  module Filters
    # Picks the dominant text block of the page. Nodes are ranked by the
    # length of their text; walking from the largest down, the first drop
    # between two consecutive ranks that exceeds the threshold (in percent of
    # the largest size) selects the node just before the drop.
    class BiggestBlock
      # threshold - drop, in percentage points, that marks the block boundary.
      def initialize threshold=50.0
        @threshold = threshold
      end

      # Returns the selected node, or the page itself when there is no body,
      # no text at all, or no drop larger than the threshold.
      def process page
        body = page.at('body')
        # Earlier filters may have removed the body altogether.
        return page unless body

        @nodes = {}
        explore(page.name, body)
        ranked = @nodes.sort_by { |size, _node| -size }

        max = ranked.first.first
        # With no text every percentage would be NaN and nothing could match.
        return page if max.zero?

        last_percents = 100.0
        last_node = ranked.first.last

        ranked.each do |size, node|
          percents = size.to_f / max * 100
          return last_node if last_percents - percents > @threshold
          last_percents = percents
          last_node = node
        end
        page
      end

      # Records node and all of its descendants in @nodes, keyed by text
      # length. Nodes with equal text length share a key, so the one visited
      # last (the deepest/latest in document order) wins. The path argument
      # is accepted for interface compatibility and is not used.
      def explore path, node
        text = node.text
        @nodes[text ? text.size : 0] = node

        node.children.each { |child| explore(path, child) }
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Webtractor::Filters
  # Filter group whose default members are the library's stock filters.
  class DefaultFilter < FilterGroup
    # Returns a new Array holding a fresh instance of each stock filter, in
    # the order in which the group applies them. The classes are looked up
    # when the method runs, not when this file is loaded.
    def filters
      stock = [
        RemoveScripts, RemoveStyles, RemoveImages, RemoveForms, RemoveTables,
        RemoveComments, RemoveNoncontent, RemoveMenus, RemoveFooter,
        RemoveEmbeds, RemoveSmallest, RemoveEmpty, RemoveAttrs, BiggestBlock
      ]
      stock.map(&:new)
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Webtractor
  module Filters
    # Composite filter: applies a list of filters in order, handing each one
    # the page returned by the previous one.
    class FilterGroup
      # fs - Array of filter instances; when nil the list returned by
      #      #filters is used.
      def initialize fs=nil
        @filters = fs || filters
      end

      # Default member list for the group. Subclasses override this; the base
      # group has no members.
      def filters
        []
      end

      # Runs the page through every member and returns the final page.
      def process page
        @filters.reduce(page) { |current, filter| filter.process(current) }
      end

      # Removes every member that is an instance of the given class (or of the
      # class of the given instance), descending into nested groups that
      # respond to remove_filter. A new list is built instead of mutating in
      # place, so an array passed to the constructor is left untouched.
      # Returns self.
      def remove_filter filter
        klass = filter.is_a?(Class) ? filter : filter.class
        @filters = @filters.reject { |f| f.is_a?(klass) }
        @filters.each { |f| f.remove_filter(klass) if f.respond_to?(:remove_filter) }
        self
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Webtractor
  module Filters
    # Removes every node under <body> (the body included) whose text is
    # empty or whitespace-only.
    class RemoveEmpty
      # Returns the same page object after pruning it in place.
      def process page
        body = page.at('body')
        # Earlier filters may have removed the body; nothing to prune then.
        explore(page.name, body) if body
        page
      end

      # Depth-first walk: children are pruned before the node itself is
      # checked, so a node left without text by the pruning is removed too.
      # The path argument is accepted for interface compatibility and is not
      # used.
      def explore path, node
        node.children.each { |child| explore(path, child) }

        text = node.text
        node.remove if text.nil? || text.strip == ''
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Webtractor
  module Filters
    # Removes navigation-like markup: <nav>, .pane and .carousel elements,
    # link-heavy <ul> lists, and any other non-<p> node in which more than
    # 30% of the direct children are links.
    class RemoveMenus
      # Returns the same page object after pruning it in place.
      def process page
        %w[nav .pane .carousel].each { |selector| page.css(selector).remove }

        page.css('ul').each do |ul|
          li_count = ul.css('li').count
          # NOTE(review): './a[@href]' only matches links that are direct
          # children of the <ul>; links nested inside <li> are not counted
          # here. Confirm whether './/a[@href]' was intended.
          a_count = ul.xpath('./a[@href]').count
          ul.remove if a_count >= li_count.to_f / 2
        end

        body = page.at('body')
        # Earlier filters may have removed the body; nothing to walk then.
        explore(page.name, body) if body

        page
      end

      # Depth-first walk removing link-dominated nodes. Children are handled
      # first, so the ratio is computed on what is left of the node.
      # Paragraphs are never removed here. The path argument is accepted for
      # interface compatibility and is not used.
      def explore path, node
        node.children.each { |child| explore(path, child) }

        return if node.name == 'p'

        links_count = node.xpath('./a').size
        return unless links_count > 0

        node.remove if links_count.to_f / node.children.count > 0.3
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Webtractor
  module Filters
    # Removes nodes under <body> (the body included) that contain fewer
    # words than child nodes.
    class RemoveSmallest
      # Returns the same page object after pruning it in place.
      def process page
        body = page.at('body')
        # Earlier filters may have removed the body; nothing to prune then.
        explore(page.name, body) if body
        page
      end

      # Depth-first walk. The word count is taken BEFORE the children are
      # pruned, while the child count is taken AFTER, so the comparison is
      # "original words vs. surviving children". The path argument is
      # accepted for interface compatibility and is not used.
      def explore path, node
        words = (node.text || '').split

        node.children.each { |child| explore(path, child) }

        node.remove if words.count < node.children.count
      end
    end
  end
end
|
data/lib/webtractor.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'cachy'
|
4
|
+
require 'moneta'
|
5
|
+
|
6
|
+
require 'webtractor/extractor'
|
7
|
+
require 'webtractor/result'
|
8
|
+
require 'webtractor/filters/filter_group'
|
9
|
+
require 'webtractor/filters/default_filter'
|
10
|
+
require 'webtractor/filters/remove_scripts'
|
11
|
+
require 'webtractor/filters/remove_styles'
|
12
|
+
require 'webtractor/filters/remove_images'
|
13
|
+
require 'webtractor/filters/remove_forms'
|
14
|
+
require 'webtractor/filters/remove_tables'
|
15
|
+
require 'webtractor/filters/remove_comments'
|
16
|
+
require 'webtractor/filters/remove_noncontent'
|
17
|
+
require 'webtractor/filters/remove_menus'
|
18
|
+
require 'webtractor/filters/remove_footer'
|
19
|
+
require 'webtractor/filters/remove_embeds'
|
20
|
+
require 'webtractor/filters/remove_smallest'
|
21
|
+
require 'webtractor/filters/remove_empty'
|
22
|
+
require 'webtractor/filters/remove_attrs'
|
23
|
+
require 'webtractor/filters/biggest_block'
|
24
|
+
|
25
|
+
require 'tmpdir'

# Fall back to a file-backed Moneta store when reading Cachy.cache_store
# raises a RuntimeError (presumably Cachy's way of signalling that no store
# has been configured -- confirm against the cachy gem). The cache directory
# lives under the system temp dir instead of a hard-coded "/tmp", so the
# fallback also works on platforms without that path.
begin
  Cachy.cache_store
rescue RuntimeError
  Cachy.cache_store = Moneta.new(:File, dir: File.join(Dir.tmpdir, 'webtractor.cache'))
end
|
metadata
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: webtractor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Rene Klacan
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-05-25 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ! '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ! '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: cachy
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: moneta
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ! '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: The Webtractor library can extract main content from websites like news,
|
56
|
+
blogs, etc without unwanted boilerplate (menus, footer, comments)
|
57
|
+
email:
|
58
|
+
- rene@klacan.sk
|
59
|
+
executables: []
|
60
|
+
extensions: []
|
61
|
+
extra_rdoc_files: []
|
62
|
+
files:
|
63
|
+
- LICENSE
|
64
|
+
- README.md
|
65
|
+
- lib/webtractor.rb
|
66
|
+
- lib/webtractor/extractor.rb
|
67
|
+
- lib/webtractor/filters/biggest_block.rb
|
68
|
+
- lib/webtractor/filters/default_filter.rb
|
69
|
+
- lib/webtractor/filters/filter_group.rb
|
70
|
+
- lib/webtractor/filters/remove_attrs.rb
|
71
|
+
- lib/webtractor/filters/remove_comments.rb
|
72
|
+
- lib/webtractor/filters/remove_embeds.rb
|
73
|
+
- lib/webtractor/filters/remove_empty.rb
|
74
|
+
- lib/webtractor/filters/remove_footer.rb
|
75
|
+
- lib/webtractor/filters/remove_forms.rb
|
76
|
+
- lib/webtractor/filters/remove_images.rb
|
77
|
+
- lib/webtractor/filters/remove_menus.rb
|
78
|
+
- lib/webtractor/filters/remove_noncontent.rb
|
79
|
+
- lib/webtractor/filters/remove_noncontent_elements.rb
|
80
|
+
- lib/webtractor/filters/remove_scripts.rb
|
81
|
+
- lib/webtractor/filters/remove_smallest.rb
|
82
|
+
- lib/webtractor/filters/remove_styles.rb
|
83
|
+
- lib/webtractor/filters/remove_tables.rb
|
84
|
+
- lib/webtractor/result.rb
|
85
|
+
- lib/webtractor/version.rb
|
86
|
+
homepage: https://github.com/reneklacan/webtractor
|
87
|
+
licenses:
|
88
|
+
- MIT
|
89
|
+
metadata: {}
|
90
|
+
post_install_message:
|
91
|
+
rdoc_options: []
|
92
|
+
require_paths:
|
93
|
+
- lib
|
94
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
95
|
+
requirements:
|
96
|
+
- - ! '>='
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '1.9'
|
99
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
requirements: []
|
105
|
+
rubyforge_project:
|
106
|
+
rubygems_version: 2.2.2
|
107
|
+
signing_key:
|
108
|
+
specification_version: 4
|
109
|
+
summary: The Webtractor library can extract main content from websites like news,
|
110
|
+
blogs, etc without unwanted boilerplate (menus, footer, comments)
|
111
|
+
test_files: []
|
112
|
+
has_rdoc:
|