webtractor 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- YjAwMDM5NDJhMzg4ODdkNThkZWQ5NmRiNGQzMzM0NDNhNjljMTkyZA==
4
+ MGJlMDlhZDk0NmI2NTkxZGYwMTBjZjA0MGI1OGI1ODY5N2YzZDMxZQ==
5
5
  data.tar.gz: !binary |-
6
- MzkzOWM4YTgyMDg1YWJlZmJiMmQxZTRlYWQ5ZTZmNDAwYmI2MjYzYw==
6
+ OTRmZjhmY2NmZmJlYjQ4ZjU5NTkzZDU4M2E0ZDJiYTQ2MzQ1Y2Y3YQ==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MDNhNTY1MzI2NDVjMzVmMzRmN2NjYTNiYTYxZDI3NjQ0Y2Y4NjllZmVkYTJh
10
- ZmY5NzkyOWQ1YmNkZWRjMmE1OTM0MTBhMTU4NDU0MDA2MDI3NmI0MjEyY2I5
11
- Yjg3YjZlYjEzMmE2YTdjM2MzOTM2ZjY0NTBkZjI3ZWU5YmJkNGY=
9
+ YzhiMmU4M2YyN2NmNGU3YmYyYjYyMzRiYjFjMjNjYTY3ZTg3ZmQzOTdkMzA4
10
+ M2MzMDI4MmIzNDA5YjEyYmRhY2Y2ZTZhZTI2ZjczZDYzMTc2NzRiMmMxOTBi
11
+ OWI0MTVhZWYwZjM3Y2JiOGJkOWRmOWMxYWUwYzEyNWMyYmJmMjI=
12
12
  data.tar.gz: !binary |-
13
- YjMzMWRjY2QyNDk2OTBlNjU1OTBmMDlhNzQ1NTdkYWQ5ZWFlNjljOWMxZjg1
14
- ZThhZGY0MWU2ODRlOWY2YTg0MGEyNjUzZTRmM2FjZDhmZThlYWM0YjgwZTAw
15
- ODRmZWMzOTViZTVmMWRmM2Q5ZGI5YjJmOTA0Mzk3MzczYTg4YmU=
13
+ YTJjYzM5YTI0NTMyNzliOTkxOWEwYTQzYTRjYjVmMDI1N2YyOGM5Njc1ODE1
14
+ NzBkNWFjMzVkNjE2YjdkOTkwMDc5ZTkxMDJkM2QyOWNmNWRkODY4NTg1OTQy
15
+ MWE4MWEzNTEzNTc4Y2IwNjA1MWI0NjQ4OWNhOWE1Mzc2ODEwZjU=
@@ -1,4 +1,5 @@
1
1
  require 'open-uri'
2
+ require 'mechanize'
2
3
  require 'nokogiri'
3
4
  require 'cachy'
4
5
  require 'moneta'
@@ -3,6 +3,7 @@ module Webtractor
3
3
  attr_accessor :filters
4
4
 
5
5
  def initialize params={}
6
+ @agent = params[:agent] || Mechanize.new
6
7
  @filters = params[:filters] || [Filters::DefaultFilter.new]
7
8
  @cache = params[:cache] || false
8
9
  @cache_params = params[:cache_params] || {}
@@ -14,15 +15,16 @@ module Webtractor
14
15
 
15
16
  def extract_from_xml page
16
17
  title = page.xpath('//head/title').text
18
+ body = page.at('body')
17
19
  @filters.each do |filter|
18
- page = filter.process(page)
20
+ body = filter.process(body)
19
21
  end
20
- Result.new(title, page)
22
+ Result.new(title, body)
21
23
  end
22
24
 
23
25
  def extract_from_url url
24
26
  content = Cachy.cache_if(@cache, "webtractor.#{url}", @cache_params) do
25
- open(url).read
27
+ @agent.get(url).content
26
28
  end
27
29
  extract(content)
28
30
  end
@@ -6,7 +6,7 @@ module Webtractor::Filters
6
6
 
7
7
  def process page
8
8
  @nodes = {}
9
- explore(page.name, page.at('body'))
9
+ explore(page.name, page)
10
10
  @nodes = Hash[@nodes.sort.reverse]
11
11
 
12
12
  max = @nodes.keys[0]
@@ -1,7 +1,7 @@
1
1
  module Webtractor::Filters
2
2
  class RemoveEmpty
3
3
  def process page
4
- explore(page.name, page.at('body'))
4
+ explore(page.name, page)
5
5
  page
6
6
  end
7
7
 
@@ -12,7 +12,9 @@ module Webtractor::Filters
12
12
  explore(path, child)
13
13
  end
14
14
 
15
- node.remove if node.text.nil? || node.text.strip == ''
15
+ empty = node.text.nil? || node.text.strip == ''
16
+ hidden = node['class'] && node['class'].include?('hidden')
17
+ node.remove if empty || hidden
16
18
  end
17
19
  end
18
20
  end
@@ -15,7 +15,7 @@ module Webtractor::Filters
15
15
  ul.remove if a_count >= li_count.to_f/2
16
16
  end
17
17
 
18
- explore(page.name, page.at('body'))
18
+ explore(page.name, page)
19
19
 
20
20
  page
21
21
  end
@@ -1,7 +1,7 @@
1
1
  module Webtractor::Filters
2
2
  class RemoveSmallest
3
3
  def process page
4
- explore(page.name, page.at('body'))
4
+ explore(page.name, page)
5
5
  page
6
6
  end
7
7
 
@@ -1,3 +1,3 @@
1
1
  module Webtractor
2
- VERSION = '0.0.2'
2
+ VERSION = '0.0.3'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webtractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
@@ -10,6 +10,20 @@ bindir: bin
10
10
  cert_chain: []
11
11
  date: 2014-05-26 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mechanize
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ! '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: nokogiri
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -52,6 +66,20 @@ dependencies:
52
66
  - - ! '>='
53
67
  - !ruby/object:Gem::Version
54
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: jazz_hands
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ! '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
55
83
  description: The Webtractor library can extract main content from websites like news,
56
84
  blogs, etc without unwanted boilerplate (menus, footer, comments)
57
85
  email: