webtractor 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ZWVkNmUwOWUwYTdjYmMyMDYwZWVjOWNhNWE1MGYyZmJmNGNjMzRkNw==
5
+ data.tar.gz: !binary |-
6
+ ODlmZWZiNDFjZjg0ZDI4ODRjYjk2ZWVkZDY0YjBkNDMxY2VlMDk3Nw==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ YTk4NzRiNTkxNDVkZmJhNDkxZDFkNmUwODVkZThmYjc1MDA3MDk2ZjZlMTg1
10
+ YjBmNzM2NjU5NGU2Y2RjYjkzNzRkZmEwZDcwZTk1NzUyNzVkNThlOTBjMGNi
11
+ MGJhOWY5YWRiNjhlOTc3OTFhMGMxY2IxMTFhY2QwNDVjZGRlNjI=
12
+ data.tar.gz: !binary |-
13
+ OWNlOTdjMTg2MDA3YzhhYzkwMDE3OTU0NjUzZDllZDY5M2FkY2NjOWZjYjAx
14
+ ZTk2MDJmODc5OWQ0ZGZjZjIzYzA4YmU4NTQ4MWY3M2E4ZTg3NjY0ODE4ZjM1
15
+ YTlmMWZiOWIxNzlmZWI2YzY5MzllMDVmZTFhNzJlMzZkZjViODk=
data/LICENSE ADDED
@@ -0,0 +1,8 @@
1
+ /*
2
+ * ----------------------------------------------------------------------------
3
+ * "THE BEER-WARE LICENSE" (Revision 42):
4
+ * As long as you retain this notice you can do whatever you want with this
5
+ * stuff. If we meet some day, and you think this stuff is worth it, you can
6
+ * buy me a beer in return. Rene Klacan
7
+ * ----------------------------------------------------------------------------
8
+ */
data/README.md ADDED
@@ -0,0 +1,92 @@
1
+ # Webtractor
2
+
3
+ Webtractor is a Ruby library that extracts the main content
4
+ from webpages such as news articles, blog posts, etc. As a result you get just the main
5
+ content without any boilerplate (menus, footers, comments, etc.).
6
+
7
+ ## Installation
8
+
9
+ You can install it directly via gem:
10
+
11
+ ```
12
+ gem install webtractor
13
+ ```
14
+
15
+ Or you can put it in your Gemfile:
16
+
17
+ ```ruby
18
+ gem 'webtractor'
19
+ ```
20
+
21
+ Then run:
22
+
23
+ ```
24
+ bundle install
25
+ ```
26
+
27
+ ## Basic usage
28
+
29
+ ```ruby
30
+ extractor = Webtractor::Extractor.new
31
+ result = extractor.extract_from_url(
32
+   'http://techcrunch.com/2014/05/24/dont-believe-anyone-who-tells-you-learning-to-code-is-easy/')
33
+ puts result.text
34
+ ```
35
+
36
+ Or
37
+
38
+ ```ruby
39
+ extractor = Webtractor::Extractor.new
40
+ result = extractor.extract '<html><body>...</body></html>'
41
+ ```
42
+
43
+ Or
44
+
45
+ ```ruby
46
+ page = Nokogiri::HTML(...)
47
+ extractor = Webtractor::Extractor.new
48
+ result = extractor.extract_from_xml page
49
+ ```
50
+
51
+ You can also access the Nokogiri document from the result via the xml attribute:
52
+
53
+ ```ruby
54
+ puts result.xml.xpath('...').text
55
+ ```
56
+
57
+ ## Advanced usage
58
+
59
+ The process of getting the main content from a webpage is really simple. It
60
+ consists of applying multiple filters to the document, where every filter
61
+ receives as its input the output of the previously applied filter.
62
+
63
+ You can look at the names of default filters:
64
+
65
+ ```ruby
66
+ p Webtractor::Filters::DefaultFilter.new.filters.map{|f| f.class.to_s}
67
+ ```
68
+
69
+ You can remove any filter:
70
+
71
+ ```ruby
72
+ extractor.remove_filter Webtractor::Filters::RemoveComments
73
+ ```
74
+
75
+ Or you can also create your own filter. It can be any class which
76
+ implements a *process* method which takes a page as an argument and returns
77
+ a page:
78
+
79
+ ```ruby
80
+ class RemoveBolds
81
+ def process page
82
+ page.css('b').remove
83
+ page
84
+ end
85
+ end
86
+
87
+ extractor.add_filter RemoveBolds.new
88
+ ```
89
+
90
+ ## License
91
+
92
+ This library is distributed under the Beerware license (see the LICENSE file).
@@ -0,0 +1,47 @@
1
module Webtractor
  # Applies an ordered list of filters to an HTML document and wraps the
  # outcome in a Result.
  #
  # A filter is any object that responds to +process(page)+ and returns the
  # page (or a replacement for it) to hand to the next filter.
  class Extractor
    # The filters applied by #extract_from_xml, in order.
    attr_accessor :filters

    # @param params [Hash] optional settings:
    #   :filters      - list of filters (defaults to one Filters::DefaultFilter)
    #   :cache        - condition handed to Cachy.cache_if (defaults to false)
    #   :cache_params - options handed to Cachy.cache_if (defaults to {})
    def initialize(params = {})
      @filters = params[:filters] || [Filters::DefaultFilter.new]
      @cache = params[:cache] || false
      @cache_params = params[:cache_params] || {}
    end

    # Parses +text+ with Nokogiri::HTML and extracts from the parsed document.
    #
    # @param text [String] raw HTML
    # @return [Result]
    def extract(text)
      extract_from_xml(Nokogiri::HTML(text))
    end

    # Runs every filter over +page+, feeding each filter the output of the
    # previous one.
    #
    # @param page an already parsed document (responds to #xpath)
    # @return [Result] built from the title and the filtered page
    def extract_from_xml(page)
      # The title is read first because the filters may strip or replace the
      # part of the document that contains it.
      title = page.xpath('//head/title').text
      filtered = @filters.inject(page) { |doc, filter| filter.process(doc) }
      Result.new(title, filtered)
    end

    # Downloads +url+ (through Cachy.cache_if, keyed by the URL) and extracts
    # from the downloaded markup.
    #
    # @param url [String] address of the page to fetch
    # @return [Result]
    def extract_from_url(url)
      content = Cachy.cache_if(@cache, "webtractor.#{url}", @cache_params) do
        # Go through URI instead of Kernel#open: Kernel#open would execute a
        # string starting with "|" as a shell command and no longer opens URLs
        # on Ruby 3. The block form also closes the handle once it is read.
        URI.parse(url).open(&:read)
      end
      extract(content)
    end

    # Appends a filter to the chain.
    #
    # @param filter a filter instance, or a filter class (instantiated with
    #   no arguments)
    # @return [Array] the updated filter list
    def add_filter(filter)
      @filters << (filter.is_a?(Class) ? filter.new : filter)
    end

    # Removes every top-level filter that is an instance of the given class.
    # Filters held inside another filter (a group) are not inspected.
    #
    # @param filter a filter class, or an instance whose class is used
    # @return [Array] the updated filter list
    def remove_filter(filter)
      klass = filter.is_a?(Class) ? filter : filter.class
      # reject! returns nil when nothing was removed, so its result must not
      # be assigned back to @filters.
      @filters.reject! { |f| f.is_a?(klass) }
      @filters
    end

    # Empties the filter chain.
    #
    # @return [Array] the (now empty) filter list
    def clear_filters
      @filters.clear
    end
  end
end
@@ -0,0 +1,37 @@
1
module Webtractor
  module Filters
    # Picks the node that most likely holds the main content, judged by the
    # length of each node's text.
    #
    # Every node below <body> is indexed by its text length. Walking the
    # lengths from largest to smallest, the first time the length (as a
    # percentage of the largest) drops by more than the threshold, the node
    # just before the drop is returned.
    class BiggestBlock
      # @param threshold [Numeric] drop, in percentage points of the largest
      #   text length, that marks the end of the main content
      def initialize(threshold = 50.0)
        @threshold = threshold
      end

      # @param page document to inspect (responds to #at and #name)
      # @return the selected node, or +page+ itself when no drop exceeds the
      #   threshold or the page has no <body>
      def process(page)
        body = page.at('body')
        # Earlier filters may have removed the body, or the document may be empty.
        return page unless body

        @nodes = {}
        explore(page.name, body)

        ranked = @nodes.sort_by { |size, _node| -size }
        @nodes = Hash[ranked]

        max, last_node = ranked.first
        # Nothing but empty text: there is no meaningful drop to look for.
        return page if max.zero?

        last_percents = 100.0
        ranked.each do |size, node|
          percents = size.to_f / max * 100
          return last_node if last_percents - percents > @threshold

          last_percents = percents
          last_node = node
        end

        page
      end

      # Records +node+ and all of its descendants in the size table. Nodes
      # with an identical text length overwrite each other, so the one
      # visited last wins.
      #
      # @param path [String] slash-separated names of the ancestors; it is
      #   only extended and passed down, never used for the result
      # @param node node to index
      def explore(path, node)
        path += "/#{node.name}"
        text = node.text
        @nodes[text ? text.size : 0] = node

        node.children.each { |child| explore(path, child) }
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module Webtractor
  module Filters
    # The stock filter group: the filters listed in #filters, applied in the
    # order given there.
    class DefaultFilter < FilterGroup
      # @return [Array] freshly built filter instances, in execution order
      def filters
        [
          RemoveScripts,
          RemoveStyles,
          RemoveImages,
          RemoveForms,
          RemoveTables,
          RemoveComments,
          RemoveNoncontent,
          RemoveMenus,
          RemoveFooter,
          RemoveEmbeds,
          RemoveSmallest,
          RemoveEmpty,
          RemoveAttrs,
          BiggestBlock
        ].map(&:new)
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module Webtractor
  module Filters
    # A filter made of other filters: processing a page runs each member in
    # order, handing every member the output of the previous one.
    class FilterGroup
      # @param fs [Array, nil] member filters; when nil the list returned by
      #   #filters is used
      def initialize(fs = nil)
        @filters = fs || filters
      end

      # Default member list. Subclasses override this to supply their own.
      #
      # @return [Array] an empty list
      def filters
        []
      end

      # @param page the document to filter
      # @return whatever the last member returned, or +page+ itself when the
      #   group is empty
      def process(page)
        @filters.inject(page) { |doc, filter| filter.process(doc) }
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
module Webtractor
  module Filters
    # Filter that strips every attribute from every element of the page.
    class RemoveAttrs
      # @param page document to clean (responds to #css); modified in place
      # @return the same page object
      def process(page)
        page.css('*').each do |element|
          # Take the attribute objects from a single lookup instead of
          # rebuilding the attributes hash once per attribute.
          element.attributes.each_value(&:remove)
        end
        page
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Webtractor
  module Filters
    # Filter that removes elements whose class or id attribute contains the
    # substring "comment" (for example reader-comment sections).
    #
    # This does not target HTML comment nodes; only the two attribute
    # queries below are evaluated.
    class RemoveComments
      XPATHS = [
        '//*[contains(@class, "comment")]',
        '//*[contains(@id, "comment")]'
      ].freeze

      # @param page document to clean (responds to #xpath); modified in place
      # @return the same page object
      def process(page)
        XPATHS.each { |query| page.xpath(query).remove }
        page
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Webtractor
  module Filters
    # Filter that removes <embed> and <object> elements from the page.
    class RemoveEmbeds
      SELECTORS = %w[embed object].freeze

      # @param page document to clean (responds to #css); modified in place
      # @return the same page object
      def process(page)
        SELECTORS.each { |selector| page.css(selector).remove }
        page
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module Webtractor
  module Filters
    # Filter that removes every node below (and including) <body> whose text
    # is nil or whitespace only.
    class RemoveEmpty
      # @param page document to clean (responds to #at and #name); modified
      #   in place
      # @return the same page object
      def process(page)
        body = page.at('body')
        # Nothing to walk when the document has no body.
        explore(page.name, body) if body
        page
      end

      # Visits the descendants of +node+ first and then +node+ itself, so a
      # parent that is left without text once its children are gone is
      # removed as well.
      #
      # @param path [String] slash-separated names of the ancestors; it is
      #   only extended and passed down
      # @param node node to inspect
      def explore(path, node)
        path += "/#{node.name}"

        node.children.each { |child| explore(path, child) }

        text = node.text
        node.remove if text.nil? || text.strip.empty?
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Webtractor
  module Filters
    # Filter that removes <footer> elements from the page.
    class RemoveFooter
      # @param page document to clean (responds to #css); modified in place
      # @return the same page object
      def process(page)
        page.css('footer').remove
        page
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Webtractor
  module Filters
    # Filter that removes <form> elements from the page.
    class RemoveForms
      # @param page document to clean (responds to #css); modified in place
      # @return the same page object
      def process(page)
        page.css('form').remove
        page
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Webtractor
  module Filters
    # Filter that removes <img> elements from the page.
    class RemoveImages
      # @param page document to clean (responds to #css); modified in place
      # @return the same page object
      def process(page)
        page.css('img').remove
        page
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
module Webtractor
  module Filters
    # Filter that removes navigation-like parts of the page: <nav> elements,
    # elements with the classes "pane" or "carousel", link-heavy lists and
    # any other link-heavy node below <body>.
    class RemoveMenus
      BLOCK_SELECTORS = %w[nav .pane .carousel].freeze

      # A node is dropped when more than this share of its children are links.
      LINK_RATIO = 0.3

      # @param page document to clean (responds to #css, #at and #name);
      #   modified in place
      # @return the same page object
      def process(page)
        BLOCK_SELECTORS.each { |selector| page.css(selector).remove }

        page.css('ul').each do |ul|
          li_count = ul.css('li').count
          # NOTE(review): './a[@href]' only matches links that are direct
          # children of the <ul>; links wrapped in <li> are not counted.
          # Confirm whether that is intended before changing the query.
          a_count = ul.xpath('./a[@href]').count
          ul.remove if a_count >= li_count.to_f / 2
        end

        body = page.at('body')
        # Nothing to walk when the document has no body.
        explore(page.name, body) if body

        page
      end

      # Visits the descendants of +node+ first and then removes +node+ itself
      # when it is link-heavy. Paragraphs (<p>) are never removed here.
      #
      # @param path [String] slash-separated names of the ancestors; it is
      #   only extended and passed down
      # @param node node to inspect
      def explore(path, node)
        path += "/#{node.name}"

        node.children.each { |child| explore(path, child) }

        return if node.name == 'p'

        links_count = node.xpath('./a').size
        return unless links_count > 0

        node.remove if links_count.to_f / node.children.count > LINK_RATIO
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Webtractor
  module Filters
    # Filter that removes <br> and <hr> elements from the page.
    class RemoveNoncontent
      SELECTORS = %w[br hr].freeze

      # @param page document to clean (responds to #css); modified in place
      # @return the same page object
      def process(page)
        SELECTORS.each { |selector| page.css(selector).remove }
        page
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Webtractor
  module Filters
    # Filter that removes <br> and <hr> elements from the page.
    #
    # NOTE(review): this class does the same work as RemoveNoncontent and is
    # not in the require list of lib/webtractor.rb; confirm whether it is
    # still needed.
    class RemoveNoncontentElements
      SELECTORS = %w[br hr].freeze

      # @param page document to clean (responds to #css); modified in place
      # @return the same page object
      def process(page)
        SELECTORS.each { |selector| page.css(selector).remove }
        page
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Webtractor
  module Filters
    # Filter that removes <script> and <noscript> elements from the page.
    class RemoveScripts
      SELECTORS = %w[script noscript].freeze

      # @param page document to clean (responds to #css); modified in place
      # @return the same page object
      def process(page)
        SELECTORS.each { |selector| page.css(selector).remove }
        page
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module Webtractor
  module Filters
    # Filter that removes nodes below (and including) <body> that hold fewer
    # words than child nodes.
    class RemoveSmallest
      # @param page document to clean (responds to #at and #name); modified
      #   in place
      # @return the same page object
      def process(page)
        body = page.at('body')
        # Nothing to walk when the document has no body.
        explore(page.name, body) if body
        page
      end

      # Visits the descendants of +node+ and then removes +node+ itself when
      # its word count is lower than its number of children.
      #
      # @param path [String] slash-separated names of the ancestors; it is
      #   only extended and passed down
      # @param node node to inspect
      def explore(path, node)
        path += "/#{node.name}"
        # The words are counted before the children are visited, while the
        # children are counted afterwards (some may be gone by then).
        word_total = (node.text || '').split.count

        node.children.each { |child| explore(path, child) }

        node.remove if word_total < node.children.count
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Webtractor
  module Filters
    # Filter that removes <style> elements from the page.
    class RemoveStyles
      # @param page document to clean (responds to #css); modified in place
      # @return the same page object
      def process(page)
        page.css('style').remove
        page
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Webtractor
  module Filters
    # Filter that removes <table> elements from the page.
    class RemoveTables
      # @param page document to clean (responds to #css); modified in place
      # @return the same page object
      def process(page)
        page.css('table').remove
        page
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module Webtractor
  # Outcome of an extraction: the page title, the extracted text and the
  # document (or node) the text was taken from.
  class Result
    attr_accessor :title, :text, :xml

    # @param title [String] title of the page
    # @param xml the filtered document or node (responds to #text); its text
    #   is read once, at construction time
    def initialize(title, xml)
      @title = title
      @xml = xml
      @text = xml.text
    end
  end
end
@@ -0,0 +1,3 @@
1
module Webtractor
  # Version of the gem; frozen so the shared constant cannot be mutated.
  VERSION = '0.0.1'.freeze
end
data/lib/webtractor.rb ADDED
@@ -0,0 +1,29 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+ require 'cachy'
4
+ require 'moneta'
5
+
6
+ require 'webtractor/extractor'
7
+ require 'webtractor/result'
8
+ require 'webtractor/filters/filter_group'
9
+ require 'webtractor/filters/default_filter'
10
+ require 'webtractor/filters/remove_scripts'
11
+ require 'webtractor/filters/remove_styles'
12
+ require 'webtractor/filters/remove_images'
13
+ require 'webtractor/filters/remove_forms'
14
+ require 'webtractor/filters/remove_tables'
15
+ require 'webtractor/filters/remove_comments'
16
+ require 'webtractor/filters/remove_noncontent'
17
+ require 'webtractor/filters/remove_menus'
18
+ require 'webtractor/filters/remove_footer'
19
+ require 'webtractor/filters/remove_embeds'
20
+ require 'webtractor/filters/remove_smallest'
21
+ require 'webtractor/filters/remove_empty'
22
+ require 'webtractor/filters/remove_attrs'
23
+ require 'webtractor/filters/biggest_block'
24
+
25
# Make sure Cachy has a cache store: keep one that is already configured,
# otherwise fall back to a file-based Moneta store.
# NOTE(review): the RuntimeError is presumably what Cachy raises when no
# store has been set yet; confirm against the Cachy version in use.
begin
  Cachy.cache_store
rescue RuntimeError
  Cachy.cache_store = Moneta.new(:File, :dir => '/tmp/webtractor.cache')
end
metadata ADDED
@@ -0,0 +1,112 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: webtractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Rene Klacan
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ! '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: cachy
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: moneta
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ! '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: The Webtractor library can extract main content from websites like news,
56
+ blogs, etc without unwanted boilerplate (menus, footer, comments)
57
+ email:
58
+ - rene@klacan.sk
59
+ executables: []
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - LICENSE
64
+ - README.md
65
+ - lib/webtractor.rb
66
+ - lib/webtractor/extractor.rb
67
+ - lib/webtractor/filters/biggest_block.rb
68
+ - lib/webtractor/filters/default_filter.rb
69
+ - lib/webtractor/filters/filter_group.rb
70
+ - lib/webtractor/filters/remove_attrs.rb
71
+ - lib/webtractor/filters/remove_comments.rb
72
+ - lib/webtractor/filters/remove_embeds.rb
73
+ - lib/webtractor/filters/remove_empty.rb
74
+ - lib/webtractor/filters/remove_footer.rb
75
+ - lib/webtractor/filters/remove_forms.rb
76
+ - lib/webtractor/filters/remove_images.rb
77
+ - lib/webtractor/filters/remove_menus.rb
78
+ - lib/webtractor/filters/remove_noncontent.rb
79
+ - lib/webtractor/filters/remove_noncontent_elements.rb
80
+ - lib/webtractor/filters/remove_scripts.rb
81
+ - lib/webtractor/filters/remove_smallest.rb
82
+ - lib/webtractor/filters/remove_styles.rb
83
+ - lib/webtractor/filters/remove_tables.rb
84
+ - lib/webtractor/result.rb
85
+ - lib/webtractor/version.rb
86
+ homepage: https://github.com/reneklacan/webtractor
87
+ licenses:
88
+ - MIT
89
+ metadata: {}
90
+ post_install_message:
91
+ rdoc_options: []
92
+ require_paths:
93
+ - lib
94
+ required_ruby_version: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ! '>='
97
+ - !ruby/object:Gem::Version
98
+ version: '1.9'
99
+ required_rubygems_version: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ requirements: []
105
+ rubyforge_project:
106
+ rubygems_version: 2.2.2
107
+ signing_key:
108
+ specification_version: 4
109
+ summary: The Webtractor library can extract main content from websites like news,
110
+ blogs, etc without unwanted boilerplate (menus, footer, comments)
111
+ test_files: []
112
+ has_rdoc: