webtractor 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/LICENSE +8 -0
- data/README.md +92 -0
- data/lib/webtractor/extractor.rb +47 -0
- data/lib/webtractor/filters/biggest_block.rb +37 -0
- data/lib/webtractor/filters/default_filter.rb +22 -0
- data/lib/webtractor/filters/filter_group.rb +18 -0
- data/lib/webtractor/filters/remove_attrs.rb +12 -0
- data/lib/webtractor/filters/remove_comments.rb +9 -0
- data/lib/webtractor/filters/remove_embeds.rb +9 -0
- data/lib/webtractor/filters/remove_empty.rb +18 -0
- data/lib/webtractor/filters/remove_footer.rb +8 -0
- data/lib/webtractor/filters/remove_forms.rb +8 -0
- data/lib/webtractor/filters/remove_images.rb +8 -0
- data/lib/webtractor/filters/remove_menus.rb +35 -0
- data/lib/webtractor/filters/remove_noncontent.rb +9 -0
- data/lib/webtractor/filters/remove_noncontent_elements.rb +9 -0
- data/lib/webtractor/filters/remove_scripts.rb +9 -0
- data/lib/webtractor/filters/remove_smallest.rb +19 -0
- data/lib/webtractor/filters/remove_styles.rb +8 -0
- data/lib/webtractor/filters/remove_tables.rb +8 -0
- data/lib/webtractor/result.rb +11 -0
- data/lib/webtractor/version.rb +3 -0
- data/lib/webtractor.rb +29 -0
- metadata +112 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
ZWVkNmUwOWUwYTdjYmMyMDYwZWVjOWNhNWE1MGYyZmJmNGNjMzRkNw==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
ODlmZWZiNDFjZjg0ZDI4ODRjYjk2ZWVkZDY0YjBkNDMxY2VlMDk3Nw==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
YTk4NzRiNTkxNDVkZmJhNDkxZDFkNmUwODVkZThmYjc1MDA3MDk2ZjZlMTg1
|
10
|
+
YjBmNzM2NjU5NGU2Y2RjYjkzNzRkZmEwZDcwZTk1NzUyNzVkNThlOTBjMGNi
|
11
|
+
MGJhOWY5YWRiNjhlOTc3OTFhMGMxY2IxMTFhY2QwNDVjZGRlNjI=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
OWNlOTdjMTg2MDA3YzhhYzkwMDE3OTU0NjUzZDllZDY5M2FkY2NjOWZjYjAx
|
14
|
+
ZTk2MDJmODc5OWQ0ZGZjZjIzYzA4YmU4NTQ4MWY3M2E4ZTg3NjY0ODE4ZjM1
|
15
|
+
YTlmMWZiOWIxNzlmZWI2YzY5MzllMDVmZTFhNzJlMzZkZjViODk=
|
data/LICENSE
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
/*
|
2
|
+
* ----------------------------------------------------------------------------
|
3
|
+
* "THE BEER-WARE LICENSE" (Revision 42):
|
4
|
+
* As long as you retain this notice you can do whatever you want with this
|
5
|
+
* stuff. If we meet some day, and you think this stuff is worth it, you can
|
6
|
+
* buy me a beer in return. Rene Klacan
|
7
|
+
* ----------------------------------------------------------------------------
|
8
|
+
*/
|
data/README.md
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
# Webtractor
|
2
|
+
|
3
|
+
The Webtractor is a ruby library which is able to extract main content
|
4
|
+
from webpages like news, blogs, etc. As a result you get just the main
|
5
|
+
content without any boilerplate (menu, footer, comments, etc).
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
You can install it directly via gem:
|
10
|
+
|
11
|
+
```
|
12
|
+
gem install webtractor
|
13
|
+
```
|
14
|
+
|
15
|
+
Or you can put it in your Gemfile:
|
16
|
+
|
17
|
+
```ruby
|
18
|
+
gem 'webtractor'
|
19
|
+
```
|
20
|
+
|
21
|
+
Then run:
|
22
|
+
|
23
|
+
```
|
24
|
+
bundle install
|
25
|
+
```
|
26
|
+
|
27
|
+
## Basic usage
|
28
|
+
|
29
|
+
```ruby
|
30
|
+
extractor = Webtractor::Extractor.new
|
31
|
+
result = extractor.extract_from_url \
|
32
|
+
'http://techcrunch.com/2014/05/24/dont-believe-anyone-who-tells-you-learning-to-code-is-easy/'
|
33
|
+
puts result.text
|
34
|
+
```
|
35
|
+
|
36
|
+
Or
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
extractor = Webtractor::Extractor.new
|
40
|
+
result = extractor.extract '<html><body>...</body></html>'
|
41
|
+
```
|
42
|
+
|
43
|
+
Or
|
44
|
+
|
45
|
+
```ruby
|
46
|
+
page = Nokogiri::HTML(...)
|
47
|
+
extractor = Webtractor::Extractor.new
|
48
|
+
result = extractor.extract_from_xml page
|
49
|
+
```
|
50
|
+
|
51
|
+
You can also access Nokogiri document from result via xml attribute:
|
52
|
+
|
53
|
+
```ruby
|
54
|
+
puts result.xml.xpath('...').text
|
55
|
+
```
|
56
|
+
|
57
|
+
## Advanced usage
|
58
|
+
|
59
|
+
Process of getting main content from the webpage is really simple. It
|
60
|
+
consists of applying multiple filters on the document where every filter
|
61
|
+
receives as input the output of the previously applied filter.
|
62
|
+
|
63
|
+
You can look at the names of default filters:
|
64
|
+
|
65
|
+
```ruby
|
66
|
+
p Webtractor::Filters::DefaultFilter.new.filters.map{|f| f.class.to_s}
|
67
|
+
```
|
68
|
+
|
69
|
+
You can remove any filter:
|
70
|
+
|
71
|
+
```ruby
|
72
|
+
extractor.remove_filter Webtractor::Filters::RemoveComments
|
73
|
+
```
|
74
|
+
|
75
|
+
Or you can also create your own filter. It can be any class which
|
76
|
+
implements *process* method which takes page as an argument and returns
|
77
|
+
page:
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
class RemoveBolds
|
81
|
+
def process page
|
82
|
+
page.css('b').remove
|
83
|
+
page
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
extractor.add_filter RemoveBolds.new
|
88
|
+
```
|
89
|
+
|
90
|
+
## License
|
91
|
+
|
92
|
+
This library is distributed under the Beerware license.
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Webtractor
  # Runs a document through an ordered list of filters and wraps whatever the
  # last filter returns, together with the original page title, in a Result.
  class Extractor
    # The ordered list of filter objects; each must respond to #process(page).
    attr_accessor :filters

    # params - options hash, every key optional:
    #   :filters      - Array of filter objects
    #                   (default: [Filters::DefaultFilter.new])
    #   :cache        - condition handed to Cachy.cache_if in
    #                   #extract_from_url (default: false)
    #   :cache_params - options handed to Cachy.cache_if (default: {})
    def initialize params={}
      @filters = params[:filters] || [Filters::DefaultFilter.new]
      @cache = params[:cache] || false
      @cache_params = params[:cache_params] || {}
    end

    # Parses +text+ with Nokogiri::HTML and extracts from the parsed document.
    def extract text
      extract_from_xml(Nokogiri::HTML(text))
    end

    # Feeds +page+ through every filter in order (each filter receives the
    # previous filter's return value) and returns a Result built from the
    # title and the final filter output.
    def extract_from_xml page
      # Read the title first: filters may remove or replace the <head>.
      title = page.xpath('//head/title').text
      filtered = @filters.inject(page) { |current, filter| filter.process(current) }
      Result.new(title, filtered)
    end

    # Downloads +url+ (through Cachy, keyed by "webtractor.<url>") and
    # extracts from the downloaded markup.
    #
    # The URL is opened via URI#open (provided by open-uri) rather than
    # Kernel#open: Kernel#open on a caller-supplied string would also open
    # local files or, for strings starting with "|", spawn a subprocess, and
    # on Ruby >= 3.0 it no longer fetches URLs at all.
    #
    # Raises URI::InvalidURIError when +url+ cannot be parsed.
    def extract_from_url url
      content = Cachy.cache_if(@cache, "webtractor.#{url}", @cache_params) do
        URI.parse(url).open { |io| io.read }
      end
      extract(content)
    end

    # Appends a filter. A Class is instantiated with no arguments first; any
    # other object is appended as-is. Returns the filter list.
    def add_filter filter
      @filters << (filter.is_a?(Class) ? filter.new : filter)
    end

    # Removes every filter that is an instance of the given class (or of the
    # class of the given filter object). Returns the filter list.
    def remove_filter filter
      filter = filter.class unless filter.is_a?(Class)
      # Array#reject! returns nil when nothing was removed, so its result must
      # not be assigned back to @filters.
      @filters.reject! { |f| f.is_a?(filter) }
      @filters
    end

    # Empties the filter list.
    def clear_filters
      @filters.clear
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Nested module definitions (instead of `module Webtractor::Filters`) so this
# file loads even when Webtractor has not been defined yet.
module Webtractor
  module Filters
    # Picks the node that holds the bulk of the text under <body>.
    #
    # Every node under <body> is indexed by the length of its text. Walking
    # the sizes from largest to smallest, the first time the size (expressed
    # as a percentage of the largest) drops by more than the threshold, the
    # node seen just before the drop is returned.
    class BiggestBlock
      # threshold - drop, in percentage points of the largest text size, that
      #             marks the boundary of the main block (default: 50.0).
      def initialize threshold=50.0
        @threshold = threshold
      end

      # Returns the node preceding the first drop larger than the threshold,
      # or +page+ itself when there is no such drop or no <body> at all.
      def process page
        body = page.at('body')
        # An earlier filter may have removed <body>; nothing to rank then.
        return page if body.nil?

        @nodes = {}
        explore(page.name, body)
        # Largest text size first.
        @nodes = Hash[@nodes.sort_by { |size, _node| size }.reverse]

        max = @nodes.keys[0]
        last_percents = 100.0
        last_node = @nodes.values[0]

        @nodes.each do |size, node|
          # With max == 0 this is NaN, which never exceeds the threshold.
          percents = size.to_f / max * 100
          return last_node if last_percents - percents > @threshold
          last_percents = percents
          last_node = node
        end
        page
      end

      # Records +node+ and all of its descendants in @nodes, keyed by text
      # length. Nodes with equal text length share a key, so the one visited
      # last wins. +path+ is not used; it is kept so the method signature
      # stays the same.
      def explore path, node
        @nodes[node.text.to_s.size] = node
        node.children.each { |child| explore(path, child) }
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Nested module definitions (instead of `module Webtractor::Filters`) so this
# file does not raise NameError when Webtractor has not been defined yet.
# FilterGroup must still be loaded before this file.
module Webtractor
  module Filters
    # The filter group used by default: a fixed, ordered chain of the
    # filters listed below.
    class DefaultFilter < FilterGroup
      # Returns a new Array of freshly instantiated filters, in the order
      # they are applied. Every call builds new instances.
      def filters
        [
          RemoveScripts,
          RemoveStyles,
          RemoveImages,
          RemoveForms,
          RemoveTables,
          RemoveComments,
          RemoveNoncontent,
          RemoveMenus,
          RemoveFooter,
          RemoveEmbeds,
          RemoveSmallest,
          RemoveEmpty,
          RemoveAttrs,
          BiggestBlock,
        ].map { |filter_class| filter_class.new }
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# Nested module definitions (instead of `module Webtractor::Filters`) so this
# file loads even when Webtractor has not been defined yet.
module Webtractor
  module Filters
    # A filter made of other filters, applied one after another.
    class FilterGroup
      # fs - Array of filter objects responding to #process(page). When nil,
      #      the list returned by #filters is used instead.
      def initialize fs=nil
        @filters = fs || filters
      end

      # The default filter list for this group; subclasses override it.
      # Note that it does not return a list passed to #initialize.
      def filters
        []
      end

      # Passes +page+ through each filter in order, handing every filter the
      # return value of the previous one. Returns the last filter's result,
      # or +page+ unchanged when the group is empty.
      def process page
        @filters.inject(page) { |current, filter| filter.process(current) }
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# Nested module definitions (instead of `module Webtractor::Filters`) so this
# file loads even when Webtractor has not been defined yet.
module Webtractor
  module Filters
    # Removes every node under <body> (including <body> itself) whose text
    # is empty or whitespace-only.
    class RemoveEmpty
      # Prunes +page+ in place and returns it. A page without <body> is
      # returned untouched.
      def process page
        body = page.at('body')
        explore(page.name, body) unless body.nil?
        page
      end

      # Visits the children of +node+ first, then removes +node+ when its
      # text is blank. +path+ is not used; it is kept so the method
      # signature stays the same.
      def explore path, node
        node.children.each { |child| explore(path, child) }
        node.remove if node.text.to_s.strip.empty?
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Nested module definitions (instead of `module Webtractor::Filters`) so this
# file loads even when Webtractor has not been defined yet.
module Webtractor
  module Filters
    # Removes navigation-like content: <nav>, `.pane` and `.carousel`
    # elements, link-heavy <ul> lists, and any non-<p> node whose direct
    # children are more than 30% links.
    class RemoveMenus
      # Prunes +page+ in place and returns it. The link-density pass over
      # <body> is skipped when the page has no <body>.
      def process page
        ['nav', '.pane', '.carousel'].each { |selector| page.css(selector).remove }

        page.css('ul').each do |ul|
          item_count = ul.css('li').count
          # NOTE(review): './a[@href]' only matches anchors that are direct
          # children of the <ul>; anchors normally sit inside <li>, so this
          # count is usually 0 and only lists without any <li> are removed
          # here. Possibly './li/a[@href]' was intended - confirm before
          # changing the heuristic.
          link_count = ul.xpath('./a[@href]').count
          ul.remove if link_count >= item_count / 2.0
        end

        body = page.at('body')
        explore(page.name, body) unless body.nil?

        page
      end

      # Visits the children of +node+ first, then removes +node+ when it is
      # not a <p> and more than 30% of its direct children are <a> elements.
      # +path+ is not used; it is kept so the method signature stays the same.
      def explore path, node
        node.children.each { |child| explore(path, child) }

        # Links inside a paragraph are regular content.
        return if node.name == 'p'

        direct_links = node.xpath('./a').size
        return if direct_links.zero?

        node.remove if direct_links.to_f / node.children.count > 0.3
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Nested module definitions (instead of `module Webtractor::Filters`) so this
# file loads even when Webtractor has not been defined yet.
module Webtractor
  module Filters
    # Removes nodes under <body> that contain fewer whitespace-separated
    # words than they have child nodes.
    class RemoveSmallest
      # Prunes +page+ in place and returns it. A page without <body> is
      # returned untouched.
      def process page
        body = page.at('body')
        explore(page.name, body) unless body.nil?
        page
      end

      # Visits the children of +node+ first, then removes +node+ when its
      # word count is below its child count. +path+ is not used; it is kept
      # so the method signature stays the same.
      def explore path, node
        # Words are counted before the children are visited, i.e. before any
        # descendant is removed; the child count is read afterwards.
        words = node.text.to_s.split

        node.children.each { |child| explore(path, child) }

        node.remove if words.count < node.children.count
      end
    end
  end
end
|
data/lib/webtractor.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'cachy'
|
4
|
+
require 'moneta'
|
5
|
+
|
6
|
+
require 'webtractor/extractor'
|
7
|
+
require 'webtractor/result'
|
8
|
+
require 'webtractor/filters/filter_group'
|
9
|
+
require 'webtractor/filters/default_filter'
|
10
|
+
require 'webtractor/filters/remove_scripts'
|
11
|
+
require 'webtractor/filters/remove_styles'
|
12
|
+
require 'webtractor/filters/remove_images'
|
13
|
+
require 'webtractor/filters/remove_forms'
|
14
|
+
require 'webtractor/filters/remove_tables'
|
15
|
+
require 'webtractor/filters/remove_comments'
|
16
|
+
require 'webtractor/filters/remove_noncontent'
|
17
|
+
require 'webtractor/filters/remove_menus'
|
18
|
+
require 'webtractor/filters/remove_footer'
|
19
|
+
require 'webtractor/filters/remove_embeds'
|
20
|
+
require 'webtractor/filters/remove_smallest'
|
21
|
+
require 'webtractor/filters/remove_empty'
|
22
|
+
require 'webtractor/filters/remove_attrs'
|
23
|
+
require 'webtractor/filters/biggest_block'
|
24
|
+
|
25
|
+
# Probe Cachy for a configured cache store. The rescue below only handles
# RuntimeError, so presumably that is what Cachy raises when no store has
# been set - TODO confirm against the Cachy documentation. In that case fall
# back to a file-based Moneta store in a fixed directory under /tmp.
begin
  Cachy.cache_store
rescue RuntimeError
  fallback_store = Moneta.new(:File, dir: '/tmp/webtractor.cache')
  Cachy.cache_store = fallback_store
end
|
metadata
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: webtractor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Rene Klacan
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-05-25 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ! '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ! '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: cachy
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: moneta
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ! '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: The Webtractor library can extract main content from websites like news,
|
56
|
+
blogs, etc without unwanted boilerplate (menus, footer, comments)
|
57
|
+
email:
|
58
|
+
- rene@klacan.sk
|
59
|
+
executables: []
|
60
|
+
extensions: []
|
61
|
+
extra_rdoc_files: []
|
62
|
+
files:
|
63
|
+
- LICENSE
|
64
|
+
- README.md
|
65
|
+
- lib/webtractor.rb
|
66
|
+
- lib/webtractor/extractor.rb
|
67
|
+
- lib/webtractor/filters/biggest_block.rb
|
68
|
+
- lib/webtractor/filters/default_filter.rb
|
69
|
+
- lib/webtractor/filters/filter_group.rb
|
70
|
+
- lib/webtractor/filters/remove_attrs.rb
|
71
|
+
- lib/webtractor/filters/remove_comments.rb
|
72
|
+
- lib/webtractor/filters/remove_embeds.rb
|
73
|
+
- lib/webtractor/filters/remove_empty.rb
|
74
|
+
- lib/webtractor/filters/remove_footer.rb
|
75
|
+
- lib/webtractor/filters/remove_forms.rb
|
76
|
+
- lib/webtractor/filters/remove_images.rb
|
77
|
+
- lib/webtractor/filters/remove_menus.rb
|
78
|
+
- lib/webtractor/filters/remove_noncontent.rb
|
79
|
+
- lib/webtractor/filters/remove_noncontent_elements.rb
|
80
|
+
- lib/webtractor/filters/remove_scripts.rb
|
81
|
+
- lib/webtractor/filters/remove_smallest.rb
|
82
|
+
- lib/webtractor/filters/remove_styles.rb
|
83
|
+
- lib/webtractor/filters/remove_tables.rb
|
84
|
+
- lib/webtractor/result.rb
|
85
|
+
- lib/webtractor/version.rb
|
86
|
+
homepage: https://github.com/reneklacan/webtractor
|
87
|
+
licenses:
|
88
|
+
- MIT
|
89
|
+
metadata: {}
|
90
|
+
post_install_message:
|
91
|
+
rdoc_options: []
|
92
|
+
require_paths:
|
93
|
+
- lib
|
94
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
95
|
+
requirements:
|
96
|
+
- - ! '>='
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '1.9'
|
99
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
requirements: []
|
105
|
+
rubyforge_project:
|
106
|
+
rubygems_version: 2.2.2
|
107
|
+
signing_key:
|
108
|
+
specification_version: 4
|
109
|
+
summary: The Webtractor library can extract main content from websites like news,
|
110
|
+
blogs, etc without unwanted boilerplate (menus, footer, comments)
|
111
|
+
test_files: []
|
112
|
+
has_rdoc:
|