webtractor 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ZWVkNmUwOWUwYTdjYmMyMDYwZWVjOWNhNWE1MGYyZmJmNGNjMzRkNw==
5
+ data.tar.gz: !binary |-
6
+ ODlmZWZiNDFjZjg0ZDI4ODRjYjk2ZWVkZDY0YjBkNDMxY2VlMDk3Nw==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ YTk4NzRiNTkxNDVkZmJhNDkxZDFkNmUwODVkZThmYjc1MDA3MDk2ZjZlMTg1
10
+ YjBmNzM2NjU5NGU2Y2RjYjkzNzRkZmEwZDcwZTk1NzUyNzVkNThlOTBjMGNi
11
+ MGJhOWY5YWRiNjhlOTc3OTFhMGMxY2IxMTFhY2QwNDVjZGRlNjI=
12
+ data.tar.gz: !binary |-
13
+ OWNlOTdjMTg2MDA3YzhhYzkwMDE3OTU0NjUzZDllZDY5M2FkY2NjOWZjYjAx
14
+ ZTk2MDJmODc5OWQ0ZGZjZjIzYzA4YmU4NTQ4MWY3M2E4ZTg3NjY0ODE4ZjM1
15
+ YTlmMWZiOWIxNzlmZWI2YzY5MzllMDVmZTFhNzJlMzZkZjViODk=
data/LICENSE ADDED
@@ -0,0 +1,8 @@
1
+ /*
2
+ * ----------------------------------------------------------------------------
3
+ * "THE BEER-WARE LICENSE" (Revision 42):
4
+ * As long as you retain this notice you can do whatever you want with this
5
+ * stuff. If we meet some day, and you think this stuff is worth it, you can
6
+ * buy me a beer in return. Rene Klacan
7
+ * ----------------------------------------------------------------------------
8
+ */
data/README.md ADDED
@@ -0,0 +1,92 @@
1
+ # Webtractor
2
+
3
+ Webtractor is a Ruby library that extracts the main content
4
+ from webpages like news, blogs, etc. As a result you get just the main
5
+ content without any boilerplate (menu, footer, comments, etc).
6
+
7
+ ## Installation
8
+
9
+ You can install it directly via gem:
10
+
11
+ ```
12
+ gem install webtractor
13
+ ```
14
+
15
+ Or you can put it in your Gemfile:
16
+
17
+ ```ruby
18
+ gem 'webtractor'
19
+ ```
20
+
21
+ Then run:
22
+
23
+ ```
24
+ bundle install
25
+ ```
26
+
27
+ ## Basic usage
28
+
29
+ ```ruby
30
+ extractor = Webtractor::Extractor.new
31
+ result = extractor.extract_from_url(
32
+ 'http://techcrunch.com/2014/05/24/dont-believe-anyone-who-tells-you-learning-to-code-is-easy/')
33
+ puts result.text
34
+ ```
35
+
36
+ Or
37
+
38
+ ```ruby
39
+ extractor = Webtractor::Extractor.new
40
+ result = extractor.extract '<html><body>...</body></html>'
41
+ ```
42
+
43
+ Or
44
+
45
+ ```ruby
46
+ page = Nokogiri::HTML(...)
47
+ extractor = Webtractor::Extractor.new
48
+ result = extractor.extract_from_xml page
49
+ ```
50
+
51
+ You can also access the Nokogiri document from the result via the xml attribute:
52
+
53
+ ```ruby
54
+ puts result.xml.xpath('...').text
55
+ ```
56
+
57
+ ## Advanced usage
58
+
59
+ The process of getting the main content from a webpage is really simple. It
60
+ consists of applying multiple filters to the document, where every filter
61
+ receives as input the output of the previously applied filter.
62
+
63
+ You can look at the names of default filters:
64
+
65
+ ```ruby
66
+ p Webtractor::Filters::DefaultFilter.new.filters.map{|f| f.class.to_s}
67
+ ```
68
+
69
+ You can remove any filter:
70
+
71
+ ```ruby
72
+ extractor.remove_filter Webtractor::Filters::RemoveComments
73
+ ```
74
+
75
+ Or you can also create your own filter. It can be any class that
76
+ implements a *process* method which takes the page as an argument and returns
77
+ the page:
78
+
79
+ ```ruby
80
+ class RemoveBolds
81
+ def process page
82
+ page.css('b').remove
83
+ page
84
+ end
85
+ end
86
+
87
+ extractor.add_filter RemoveBolds.new
88
+ ```
89
+
90
+ ## License
91
+
92
+ This library is distributed under the Beerware license.
@@ -0,0 +1,47 @@
1
module Webtractor
  # Applies a chain of filters to an HTML document and wraps the outcome in a
  # Result.
  #
  # A filter is any object that responds to +process(page)+ and returns the page
  # to hand to the next filter. Filters run in the order they appear in
  # +filters+.
  class Extractor
    attr_accessor :filters

    # @param params [Hash] optional settings:
    #   * +:filters+      - filter chain to use (default: one Filters::DefaultFilter)
    #   * +:cache+        - flag handed to Cachy.cache_if by #extract_from_url (default: false)
    #   * +:cache_params+ - options handed to Cachy.cache_if (default: {})
    def initialize(params = {})
      @filters = params[:filters] || [Filters::DefaultFilter.new]
      @cache = params[:cache] || false
      @cache_params = params[:cache_params] || {}
    end

    # Parses +text+ as HTML with Nokogiri and runs the filter chain on it.
    #
    # @param text [String] HTML markup
    # @return [Result]
    def extract(text)
      extract_from_xml(Nokogiri::HTML(text))
    end

    # Runs the filter chain on an already parsed document.
    #
    # @param page [#xpath] parsed document (the README passes a Nokogiri::HTML document)
    # @return [Result] title of the original document plus the filtered page
    def extract_from_xml(page)
      # Read the title first: a filter may hand back a different page object.
      title = page.xpath('//head/title').text
      filtered = @filters.inject(page) { |current, filter| filter.process(current) }
      Result.new(title, filtered)
    end

    # Downloads +url+ (through Cachy.cache_if) and extracts its content.
    #
    # The download goes through URI#open from open-uri rather than Kernel#open:
    # Kernel#open would treat a string starting with "|" as a command to run,
    # and it no longer fetches URLs on Ruby 3.0+. The block form closes the
    # handle once the body has been read.
    #
    # @param url [String] address of the page to fetch
    # @return [Result]
    def extract_from_url(url)
      content = Cachy.cache_if(@cache, "webtractor.#{url}", @cache_params) do
        URI.parse(url).open(&:read)
      end
      extract(content)
    end

    # Appends a filter to the chain.
    #
    # @param filter [Class, #process] a filter instance, or a filter class that
    #   is instantiated with no arguments
    # @return [Array] the updated filter chain
    def add_filter(filter)
      @filters << (filter.is_a?(Class) ? filter.new : filter)
    end

    # Removes every filter that is an instance of the given class.
    #
    # @param filter [Class, Object] a filter class, or an instance whose class
    #   is used
    # @return [Array] the filter chain, also when nothing was removed
    def remove_filter(filter)
      filter_class = filter.is_a?(Class) ? filter : filter.class
      # reject! returns nil when nothing matched, so its result must not be
      # assigned back to @filters.
      @filters.reject! { |candidate| candidate.is_a?(filter_class) }
      @filters
    end

    # Empties the filter chain.
    #
    # @return [Array] the (now empty) filter chain
    def clear_filters
      @filters.clear
    end
  end
end
@@ -0,0 +1,37 @@
1
module Webtractor
  module Filters
    # Picks the node that precedes the first large drop in text size.
    #
    # Every node under <body> is indexed by the length of its text. Sizes are
    # then walked from largest to smallest; as soon as the size, expressed as a
    # percentage of the largest one, falls by more than +threshold+ points
    # between two consecutive entries, the node of the previous entry is
    # returned. When no such drop exists the page itself is returned.
    class BiggestBlock
      # @param threshold [Numeric] minimum drop, in percentage points, that
      #   ends the search
      def initialize(threshold = 50.0)
        @threshold = threshold
      end

      # @param page [#at, #name] document to inspect
      # @return [Object] the selected node, or +page+ when there is no <body>
      #   or no drop larger than the threshold
      def process(page)
        body = page.at('body')
        # Earlier filters may have removed <body>; there is nothing to rank then.
        return page unless body

        @nodes = {}
        explore(page.name, body)
        # Sizes are unique hash keys, so ordering by size alone is sufficient.
        @nodes = Hash[@nodes.sort_by { |size, _node| -size }]

        max = @nodes.keys.first
        last_percents = 100.0
        last_node = @nodes.values.first

        @nodes.each do |size, node|
          percents = size.to_f / max * 100
          return last_node if last_percents - percents > @threshold

          last_percents = percents
          last_node = node
        end
        page
      end

      # Records +node+ and all of its descendants in @nodes, keyed by text
      # length. Nodes with equal text length overwrite each other; the one
      # visited last wins.
      #
      # @param path [String] breadcrumb of node names; only passed down, never read
      # @param node [#name, #text, #children]
      def explore(path, node)
        path += "/#{node.name}"
        @nodes[node.text.to_s.size] = node
        node.children.each { |child| explore(path, child) }
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module Webtractor
  module Filters
    # The filter chain used by Extractor when no filters are supplied.
    class DefaultFilter < FilterGroup
      # Builds a fresh instance of every default filter, in the order in which
      # they are applied.
      #
      # @return [Array] newly created filter instances
      def filters
        [
          RemoveScripts,
          RemoveStyles,
          RemoveImages,
          RemoveForms,
          RemoveTables,
          RemoveComments,
          RemoveNoncontent,
          RemoveMenus,
          RemoveFooter,
          RemoveEmbeds,
          RemoveSmallest,
          RemoveEmpty,
          RemoveAttrs,
          BiggestBlock
        ].map(&:new)
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module Webtractor
  module Filters
    # A filter made of other filters, applied one after another.
    class FilterGroup
      # @param fs [Array, nil] filters to apply; when nil the list returned by
      #   #filters is used
      def initialize(fs = nil)
        @filters = fs || filters
      end

      # Default filter list; subclasses override this to supply their own.
      #
      # @return [Array]
      def filters
        []
      end

      # Feeds +page+ through every filter, passing each result to the next one.
      #
      # @param page [Object] page handed to the first filter
      # @return [Object] whatever the last filter returned (or +page+ when the
      #   group is empty)
      def process(page)
        @filters.inject(page) { |current, filter| filter.process(current) }
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
module Webtractor
  module Filters
    # Strips every attribute from every element of the page.
    class RemoveAttrs
      # @param page [#css] document to clean
      # @return [Object] the same +page+
      def process(page)
        page.css('*').each do |element|
          # The attributes hash maps names to attribute objects; removing each
          # value avoids looking the attribute up again by name.
          element.attributes.each_value(&:remove)
        end
        page
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Webtractor
  module Filters
    # Removes elements whose class or id attribute contains "comment".
    class RemoveComments
      # XPath queries selecting the elements to drop, in application order.
      XPATHS = [
        '//*[contains(@class, "comment")]',
        '//*[contains(@id, "comment")]'
      ].freeze

      # @param page [#xpath] document to clean
      # @return [Object] the same +page+
      def process(page)
        XPATHS.each { |query| page.xpath(query).remove }
        page
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Webtractor
  module Filters
    # Removes <embed> and <object> elements.
    class RemoveEmbeds
      # CSS selectors of the elements to drop, in application order.
      SELECTORS = %w[embed object].freeze

      # @param page [#css] document to clean
      # @return [Object] the same +page+
      def process(page)
        SELECTORS.each { |selector| page.css(selector).remove }
        page
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module Webtractor
  module Filters
    # Removes nodes under <body> (and <body> itself) whose text is blank.
    class RemoveEmpty
      # @param page [#at, #name] document to clean
      # @return [Object] the same +page+
      def process(page)
        body = page.at('body')
        # Nothing to walk when the document has no <body>.
        explore(page.name, body) if body
        page
      end

      # Visits the children first, then removes +node+ when its text is nil,
      # empty or whitespace only.
      #
      # @param path [String] breadcrumb of node names; only passed down, never read
      # @param node [#name, #text, #children, #remove]
      def explore(path, node)
        path += "/#{node.name}"
        node.children.each { |child| explore(path, child) }
        node.remove if node.text.to_s.strip.empty?
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Webtractor
  module Filters
    # Removes <footer> elements.
    class RemoveFooter
      # @param page [#css] document to clean
      # @return [Object] the same +page+
      def process(page)
        footers = page.css('footer')
        footers.remove
        page
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Webtractor
  module Filters
    # Removes <form> elements.
    class RemoveForms
      # @param page [#css] document to clean
      # @return [Object] the same +page+
      def process(page)
        forms = page.css('form')
        forms.remove
        page
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Webtractor
  module Filters
    # Removes <img> elements.
    class RemoveImages
      # @param page [#css] document to clean
      # @return [Object] the same +page+
      def process(page)
        images = page.css('img')
        images.remove
        page
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
module Webtractor
  module Filters
    # Removes navigation-like markup: <nav>, ".pane" and ".carousel" elements,
    # link-heavy lists, and any non-paragraph node whose direct children are
    # mostly links.
    class RemoveMenus
      # CSS selectors that are removed unconditionally, in application order.
      SELECTORS = %w[nav .pane .carousel].freeze

      # Share of direct children that must be <a> elements (exclusive) for a
      # node to be removed by #explore.
      LINK_RATIO = 0.3

      # @param page [#css, #at, #name] document to clean
      # @return [Object] the same +page+
      def process(page)
        SELECTORS.each { |selector| page.css(selector).remove }

        page.css('ul').each do |ul|
          li_count = ul.css('li').count
          # NOTE(review): './a[@href]' only matches links that are direct
          # children of the <ul>, while links normally sit inside <li>; as
          # written this mostly fires for lists without any <li>. Possibly
          # './/a[@href]' was intended - confirm before changing, since it
          # alters what gets extracted.
          a_count = ul.xpath('./a[@href]').count
          ul.remove if a_count >= li_count / 2.0
        end

        body = page.at('body')
        # Nothing to walk when the document has no <body>.
        explore(page.name, body) if body

        page
      end

      # Visits the children first, then removes +node+ when more than 30% of
      # its direct children are <a> elements. <p> nodes are never removed here.
      #
      # @param path [String] breadcrumb of node names; only passed down, never read
      # @param node [#name, #children, #xpath, #remove]
      def explore(path, node)
        path += "/#{node.name}"
        node.children.each { |child| explore(path, child) }

        return if node.name == 'p'

        links_count = node.xpath('./a').size
        return unless links_count > 0

        node.remove if links_count.to_f / node.children.count > LINK_RATIO
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Webtractor
  module Filters
    # Removes <br> and <hr> elements.
    class RemoveNoncontent
      # CSS selectors of the elements to drop, in application order.
      SELECTORS = %w[br hr].freeze

      # @param page [#css] document to clean
      # @return [Object] the same +page+
      def process(page)
        SELECTORS.each { |selector| page.css(selector).remove }
        page
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Webtractor
  module Filters
    # Removes <br> and <hr> elements.
    #
    # NOTE(review): the gem also ships a RemoveNoncontent filter with the same
    # body, and lib/webtractor.rb does not require this file - confirm whether
    # this class is a leftover.
    class RemoveNoncontentElements
      # CSS selectors of the elements to drop, in application order.
      SELECTORS = %w[br hr].freeze

      # @param page [#css] document to clean
      # @return [Object] the same +page+
      def process(page)
        SELECTORS.each { |selector| page.css(selector).remove }
        page
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Webtractor
  module Filters
    # Removes <script> and <noscript> elements.
    class RemoveScripts
      # CSS selectors of the elements to drop, in application order.
      SELECTORS = %w[script noscript].freeze

      # @param page [#css] document to clean
      # @return [Object] the same +page+
      def process(page)
        SELECTORS.each { |selector| page.css(selector).remove }
        page
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module Webtractor
  module Filters
    # Removes nodes under <body> (and <body> itself) that contain fewer words
    # than child nodes.
    class RemoveSmallest
      # @param page [#at, #name] document to clean
      # @return [Object] the same +page+
      def process(page)
        body = page.at('body')
        # Nothing to walk when the document has no <body>.
        explore(page.name, body) if body
        page
      end

      # Removes +node+ when its word count is lower than the number of children
      # it has left after the children themselves were processed.
      #
      # @param path [String] breadcrumb of node names; only passed down, never read
      # @param node [#name, #text, #children, #remove]
      def explore(path, node)
        path += "/#{node.name}"
        # Words are counted before descending, i.e. before any child is removed.
        word_count = node.text.to_s.split.count

        node.children.each { |child| explore(path, child) }

        node.remove if word_count < node.children.count
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Webtractor
  module Filters
    # Removes <style> elements.
    class RemoveStyles
      # @param page [#css] document to clean
      # @return [Object] the same +page+
      def process(page)
        styles = page.css('style')
        styles.remove
        page
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Webtractor
  module Filters
    # Removes <table> elements.
    class RemoveTables
      # @param page [#css] document to clean
      # @return [Object] the same +page+
      def process(page)
        tables = page.css('table')
        tables.remove
        page
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module Webtractor
  # Outcome of an extraction: the page title, the extracted text and the
  # document the text was taken from.
  class Result
    attr_accessor :title, :text, :xml

    # @param title [String] title of the page
    # @param xml [#text] filtered document; its text is read once, here
    def initialize(title, xml)
      @title, @xml = title, xml
      @text = xml.text
    end
  end
end
@@ -0,0 +1,3 @@
1
module Webtractor
  # Version of the gem; frozen so the shared constant cannot be mutated.
  VERSION = '0.0.1'.freeze
end
data/lib/webtractor.rb ADDED
@@ -0,0 +1,29 @@
1
# Entry point of the gem: loads the third-party dependencies and the library
# files, then makes sure Cachy has a cache store.
require 'open-uri'
require 'nokogiri'
require 'cachy'
require 'moneta'

# Loaded so that Webtractor::VERSION is defined after `require 'webtractor'`.
require 'webtractor/version'
require 'webtractor/extractor'
require 'webtractor/result'
# filter_group must come before default_filter, which subclasses FilterGroup.
require 'webtractor/filters/filter_group'
require 'webtractor/filters/default_filter'
require 'webtractor/filters/remove_scripts'
require 'webtractor/filters/remove_styles'
require 'webtractor/filters/remove_images'
require 'webtractor/filters/remove_forms'
require 'webtractor/filters/remove_tables'
require 'webtractor/filters/remove_comments'
require 'webtractor/filters/remove_noncontent'
require 'webtractor/filters/remove_menus'
require 'webtractor/filters/remove_footer'
require 'webtractor/filters/remove_embeds'
require 'webtractor/filters/remove_smallest'
require 'webtractor/filters/remove_empty'
require 'webtractor/filters/remove_attrs'
require 'webtractor/filters/biggest_block'
# NOTE(review): webtractor/filters/remove_noncontent_elements ships with the
# gem but is not loaded here - confirm whether it is a leftover.

# Install a file-backed Moneta store when reading Cachy.cache_store raises
# RuntimeError (presumably raised when no store is configured - confirm
# against Cachy).
begin
  Cachy.cache_store
rescue RuntimeError
  Cachy.cache_store = Moneta.new(:File, dir: '/tmp/webtractor.cache')
end
metadata ADDED
@@ -0,0 +1,112 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: webtractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Rene Klacan
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ! '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: cachy
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: moneta
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ! '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: The Webtractor library can extract main content from websites like news,
56
+ blogs, etc without unwanted boilerplate (menus, footer, comments)
57
+ email:
58
+ - rene@klacan.sk
59
+ executables: []
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - LICENSE
64
+ - README.md
65
+ - lib/webtractor.rb
66
+ - lib/webtractor/extractor.rb
67
+ - lib/webtractor/filters/biggest_block.rb
68
+ - lib/webtractor/filters/default_filter.rb
69
+ - lib/webtractor/filters/filter_group.rb
70
+ - lib/webtractor/filters/remove_attrs.rb
71
+ - lib/webtractor/filters/remove_comments.rb
72
+ - lib/webtractor/filters/remove_embeds.rb
73
+ - lib/webtractor/filters/remove_empty.rb
74
+ - lib/webtractor/filters/remove_footer.rb
75
+ - lib/webtractor/filters/remove_forms.rb
76
+ - lib/webtractor/filters/remove_images.rb
77
+ - lib/webtractor/filters/remove_menus.rb
78
+ - lib/webtractor/filters/remove_noncontent.rb
79
+ - lib/webtractor/filters/remove_noncontent_elements.rb
80
+ - lib/webtractor/filters/remove_scripts.rb
81
+ - lib/webtractor/filters/remove_smallest.rb
82
+ - lib/webtractor/filters/remove_styles.rb
83
+ - lib/webtractor/filters/remove_tables.rb
84
+ - lib/webtractor/result.rb
85
+ - lib/webtractor/version.rb
86
+ homepage: https://github.com/reneklacan/webtractor
87
+ licenses:
88
+ - MIT
89
+ metadata: {}
90
+ post_install_message:
91
+ rdoc_options: []
92
+ require_paths:
93
+ - lib
94
+ required_ruby_version: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ! '>='
97
+ - !ruby/object:Gem::Version
98
+ version: '1.9'
99
+ required_rubygems_version: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ requirements: []
105
+ rubyforge_project:
106
+ rubygems_version: 2.2.2
107
+ signing_key:
108
+ specification_version: 4
109
+ summary: The Webtractor library can extract main content from websites like news,
110
+ blogs, etc without unwanted boilerplate (menus, footer, comments)
111
+ test_files: []
112
+ has_rdoc: