webtractor 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- YjAwMDM5NDJhMzg4ODdkNThkZWQ5NmRiNGQzMzM0NDNhNjljMTkyZA==
4
+ MGJlMDlhZDk0NmI2NTkxZGYwMTBjZjA0MGI1OGI1ODY5N2YzZDMxZQ==
5
5
  data.tar.gz: !binary |-
6
- MzkzOWM4YTgyMDg1YWJlZmJiMmQxZTRlYWQ5ZTZmNDAwYmI2MjYzYw==
6
+ OTRmZjhmY2NmZmJlYjQ4ZjU5NTkzZDU4M2E0ZDJiYTQ2MzQ1Y2Y3YQ==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MDNhNTY1MzI2NDVjMzVmMzRmN2NjYTNiYTYxZDI3NjQ0Y2Y4NjllZmVkYTJh
10
- ZmY5NzkyOWQ1YmNkZWRjMmE1OTM0MTBhMTU4NDU0MDA2MDI3NmI0MjEyY2I5
11
- Yjg3YjZlYjEzMmE2YTdjM2MzOTM2ZjY0NTBkZjI3ZWU5YmJkNGY=
9
+ YzhiMmU4M2YyN2NmNGU3YmYyYjYyMzRiYjFjMjNjYTY3ZTg3ZmQzOTdkMzA4
10
+ M2MzMDI4MmIzNDA5YjEyYmRhY2Y2ZTZhZTI2ZjczZDYzMTc2NzRiMmMxOTBi
11
+ OWI0MTVhZWYwZjM3Y2JiOGJkOWRmOWMxYWUwYzEyNWMyYmJmMjI=
12
12
  data.tar.gz: !binary |-
13
- YjMzMWRjY2QyNDk2OTBlNjU1OTBmMDlhNzQ1NTdkYWQ5ZWFlNjljOWMxZjg1
14
- ZThhZGY0MWU2ODRlOWY2YTg0MGEyNjUzZTRmM2FjZDhmZThlYWM0YjgwZTAw
15
- ODRmZWMzOTViZTVmMWRmM2Q5ZGI5YjJmOTA0Mzk3MzczYTg4YmU=
13
+ YTJjYzM5YTI0NTMyNzliOTkxOWEwYTQzYTRjYjVmMDI1N2YyOGM5Njc1ODE1
14
+ NzBkNWFjMzVkNjE2YjdkOTkwMDc5ZTkxMDJkM2QyOWNmNWRkODY4NTg1OTQy
15
+ MWE4MWEzNTEzNTc4Y2IwNjA1MWI0NjQ4OWNhOWE1Mzc2ODEwZjU=
@@ -1,4 +1,5 @@
1
1
  require 'open-uri'
2
+ require 'mechanize'
2
3
  require 'nokogiri'
3
4
  require 'cachy'
4
5
  require 'moneta'
@@ -3,6 +3,7 @@ module Webtractor
3
3
  attr_accessor :filters
4
4
 
5
5
  def initialize params={}
6
+ @agent = params[:agent] || Mechanize.new
6
7
  @filters = params[:filters] || [Filters::DefaultFilter.new]
7
8
  @cache = params[:cache] || false
8
9
  @cache_params = params[:cache_params] || {}
@@ -14,15 +15,16 @@ module Webtractor
14
15
 
15
16
  def extract_from_xml page
16
17
  title = page.xpath('//head/title').text
18
+ body = page.at('body')
17
19
  @filters.each do |filter|
18
- page = filter.process(page)
20
+ body = filter.process(body)
19
21
  end
20
- Result.new(title, page)
22
+ Result.new(title, body)
21
23
  end
22
24
 
23
25
  def extract_from_url url
24
26
  content = Cachy.cache_if(@cache, "webtractor.#{url}", @cache_params) do
25
- open(url).read
27
+ @agent.get(url).content
26
28
  end
27
29
  extract(content)
28
30
  end
@@ -6,7 +6,7 @@ module Webtractor::Filters
6
6
 
7
7
  def process page
8
8
  @nodes = {}
9
- explore(page.name, page.at('body'))
9
+ explore(page.name, page)
10
10
  @nodes = Hash[@nodes.sort.reverse]
11
11
 
12
12
  max = @nodes.keys[0]
@@ -1,7 +1,7 @@
1
1
  module Webtractor::Filters
2
2
  class RemoveEmpty
3
3
  def process page
4
- explore(page.name, page.at('body'))
4
+ explore(page.name, page)
5
5
  page
6
6
  end
7
7
 
@@ -12,7 +12,9 @@ module Webtractor::Filters
12
12
  explore(path, child)
13
13
  end
14
14
 
15
- node.remove if node.text.nil? || node.text.strip == ''
15
+ empty = node.text.nil? || node.text.strip == ''
16
+ hidden = node['class'] && node['class'].include?('hidden')
17
+ node.remove if empty || hidden
16
18
  end
17
19
  end
18
20
  end
@@ -15,7 +15,7 @@ module Webtractor::Filters
15
15
  ul.remove if a_count >= li_count.to_f/2
16
16
  end
17
17
 
18
- explore(page.name, page.at('body'))
18
+ explore(page.name, page)
19
19
 
20
20
  page
21
21
  end
@@ -1,7 +1,7 @@
1
1
  module Webtractor::Filters
2
2
  class RemoveSmallest
3
3
  def process page
4
- explore(page.name, page.at('body'))
4
+ explore(page.name, page)
5
5
  page
6
6
  end
7
7
 
@@ -1,3 +1,3 @@
1
1
  module Webtractor
2
- VERSION = '0.0.2'
2
+ VERSION = '0.0.3'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webtractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
@@ -10,6 +10,20 @@ bindir: bin
10
10
  cert_chain: []
11
11
  date: 2014-05-26 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mechanize
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ! '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: nokogiri
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -52,6 +66,20 @@ dependencies:
52
66
  - - ! '>='
53
67
  - !ruby/object:Gem::Version
54
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: jazz_hands
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ! '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
55
83
  description: The Webtractor library can extract main content from websites like news,
56
84
  blogs, etc without unwanted boilerplate (menus, footer, comments)
57
85
  email: