webtractor 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/lib/webtractor.rb +1 -0
- data/lib/webtractor/extractor.rb +5 -3
- data/lib/webtractor/filters/biggest_block.rb +1 -1
- data/lib/webtractor/filters/remove_empty.rb +4 -2
- data/lib/webtractor/filters/remove_menus.rb +1 -1
- data/lib/webtractor/filters/remove_smallest.rb +1 -1
- data/lib/webtractor/version.rb +1 -1
- metadata +29 -1
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MGJlMDlhZDk0NmI2NTkxZGYwMTBjZjA0MGI1OGI1ODY5N2YzZDMxZQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OTRmZjhmY2NmZmJlYjQ4ZjU5NTkzZDU4M2E0ZDJiYTQ2MzQ1Y2Y3YQ==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YzhiMmU4M2YyN2NmNGU3YmYyYjYyMzRiYjFjMjNjYTY3ZTg3ZmQzOTdkMzA4
|
10
|
+
M2MzMDI4MmIzNDA5YjEyYmRhY2Y2ZTZhZTI2ZjczZDYzMTc2NzRiMmMxOTBi
|
11
|
+
OWI0MTVhZWYwZjM3Y2JiOGJkOWRmOWMxYWUwYzEyNWMyYmJmMjI=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YTJjYzM5YTI0NTMyNzliOTkxOWEwYTQzYTRjYjVmMDI1N2YyOGM5Njc1ODE1
|
14
|
+
NzBkNWFjMzVkNjE2YjdkOTkwMDc5ZTkxMDJkM2QyOWNmNWRkODY4NTg1OTQy
|
15
|
+
MWE4MWEzNTEzNTc4Y2IwNjA1MWI0NjQ4OWNhOWE1Mzc2ODEwZjU=
|
data/lib/webtractor.rb
CHANGED
data/lib/webtractor/extractor.rb
CHANGED
@@ -3,6 +3,7 @@ module Webtractor
|
|
3
3
|
attr_accessor :filters
|
4
4
|
|
5
5
|
def initialize params={}
|
6
|
+
@agent = params[:agent] || Mechanize.new
|
6
7
|
@filters = params[:filters] || [Filters::DefaultFilter.new]
|
7
8
|
@cache = params[:cache] || false
|
8
9
|
@cache_params = params[:cache_params] || {}
|
@@ -14,15 +15,16 @@ module Webtractor
|
|
14
15
|
|
15
16
|
def extract_from_xml page
|
16
17
|
title = page.xpath('//head/title').text
|
18
|
+
body = page.at('body')
|
17
19
|
@filters.each do |filter|
|
18
|
-
|
20
|
+
body = filter.process(body)
|
19
21
|
end
|
20
|
-
Result.new(title,
|
22
|
+
Result.new(title, body)
|
21
23
|
end
|
22
24
|
|
23
25
|
def extract_from_url url
|
24
26
|
content = Cachy.cache_if(@cache, "webtractor.#{url}", @cache_params) do
|
25
|
-
|
27
|
+
@agent.get(url).content
|
26
28
|
end
|
27
29
|
extract(content)
|
28
30
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Webtractor::Filters
|
2
2
|
class RemoveEmpty
|
3
3
|
def process page
|
4
|
-
explore(page.name, page
|
4
|
+
explore(page.name, page)
|
5
5
|
page
|
6
6
|
end
|
7
7
|
|
@@ -12,7 +12,9 @@ module Webtractor::Filters
|
|
12
12
|
explore(path, child)
|
13
13
|
end
|
14
14
|
|
15
|
-
|
15
|
+
empty = node.text.nil? || node.text.strip == ''
|
16
|
+
hidden = node['class'] && node['class'].include?('hidden')
|
17
|
+
node.remove if empty || hidden
|
16
18
|
end
|
17
19
|
end
|
18
20
|
end
|
data/lib/webtractor/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webtractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
@@ -10,6 +10,20 @@ bindir: bin
|
|
10
10
|
cert_chain: []
|
11
11
|
date: 2014-05-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: mechanize
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ! '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ! '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: nokogiri
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -52,6 +66,20 @@ dependencies:
|
|
52
66
|
- - ! '>='
|
53
67
|
- !ruby/object:Gem::Version
|
54
68
|
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: jazz_hands
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ! '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
55
83
|
description: The Webtractor library can extract main content from websites like news,
|
56
84
|
blogs, etc without unwanted boilerplate (menus, footer, comments)
|
57
85
|
email:
|