content_scrapper 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/.specification +114 -0
- data/VERSION +1 -1
- data/content_scrapper.gemspec +3 -2
- data/lib/content_scrapper/content_mapping.rb +4 -2
- data/lib/content_scrapper.rb +14 -8
- data/test/test_content_mapping.rb +2 -1
- data/test/test_content_scrapper.rb +35 -3
- metadata +3 -2
data/.specification
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: content_scrapper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.4
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Gyorgy Frivolt
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-13 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: thoughtbot-shoulda
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 2.10.2
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: mocha
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.9.8
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: sanitize
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.2.0
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: nokogiri
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.4.1
|
54
|
+
version:
|
55
|
+
description: If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.
|
56
|
+
email: gyorgy.frivolt@gmail.com
|
57
|
+
executables: []
|
58
|
+
|
59
|
+
extensions: []
|
60
|
+
|
61
|
+
extra_rdoc_files:
|
62
|
+
- LICENSE
|
63
|
+
- README.rdoc
|
64
|
+
files:
|
65
|
+
- .document
|
66
|
+
- .gitignore
|
67
|
+
- LICENSE
|
68
|
+
- README.rdoc
|
69
|
+
- Rakefile
|
70
|
+
- VERSION
|
71
|
+
- config/content_scrapper.rb
|
72
|
+
- content_scrapper.gemspec
|
73
|
+
- lib/content_scrapper.rb
|
74
|
+
- lib/content_scrapper/content_mapping.rb
|
75
|
+
- lib/content_scrapper/feedzirra.rb
|
76
|
+
- rails/init.rb
|
77
|
+
- test/helper.rb
|
78
|
+
- test/test_content_mapping.rb
|
79
|
+
- test/test_content_scrapper.rb
|
80
|
+
- test/test_pages/pretty.html
|
81
|
+
- test/test_pages/twocontent.html
|
82
|
+
- test/test_pages/ugly.html
|
83
|
+
has_rdoc: true
|
84
|
+
homepage: http://github.com/fifigyuri/content_scrapper
|
85
|
+
licenses: []
|
86
|
+
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options:
|
89
|
+
- --charset=UTF-8
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: "0"
|
97
|
+
version:
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: "0"
|
103
|
+
version:
|
104
|
+
requirements: []
|
105
|
+
|
106
|
+
rubyforge_project:
|
107
|
+
rubygems_version: 1.3.5
|
108
|
+
signing_key:
|
109
|
+
specification_version: 3
|
110
|
+
summary: Gem for those who want to screen scrap only the content part of web pages, blogs or articles.
|
111
|
+
test_files:
|
112
|
+
- test/test_content_mapping.rb
|
113
|
+
- test/test_content_scrapper.rb
|
114
|
+
- test/helper.rb
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.4
|
1
|
+
0.0.5
|
data/content_scrapper.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{content_scrapper}
|
8
|
-
s.version = "0.0.4"
|
8
|
+
s.version = "0.0.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Gyorgy Frivolt"]
|
12
|
-
s.date = %q{2010-02-13}
|
12
|
+
s.date = %q{2010-02-22}
|
13
13
|
s.description = %q{If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.}
|
14
14
|
s.email = %q{gyorgy.frivolt@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -19,6 +19,7 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
21
|
".gitignore",
|
22
|
+
".specification",
|
22
23
|
"LICENSE",
|
23
24
|
"README.rdoc",
|
24
25
|
"Rakefile",
|
data/lib/content_scrapper/content_mapping.rb
CHANGED
@@ -19,10 +19,12 @@ class ContentMapping
|
|
19
19
|
url =~ @url_pattern_regexp
|
20
20
|
end
|
21
21
|
|
22
|
-
def scrap_content(doc)
|
22
|
+
def scrap_content(doc, content_scrapper = nil)
|
23
23
|
@content_xpaths_list.each do |content_xpath|
|
24
24
|
content_section = doc.xpath(content_xpath)
|
25
|
-
|
25
|
+
content = content_section.to_a.join("\n")
|
26
|
+
content = Sanitize.clean(content, content_scrapper.sanitize_settings) unless content_scrapper.nil?
|
27
|
+
return content if content_section.count > 0
|
26
28
|
end
|
27
29
|
nil
|
28
30
|
end
|
data/lib/content_scrapper.rb
CHANGED
@@ -24,7 +24,7 @@ class ContentScrapper
|
|
24
24
|
|
25
25
|
def initialize(scrapper_config_file = nil)
|
26
26
|
@content_mappings = []
|
27
|
-
config_file = ContentScrapper.default_config_file
|
27
|
+
config_file = scrapper_config_file || ContentScrapper.default_config_file
|
28
28
|
self.instance_eval(File.read(config_file), config_file) unless config_file.nil?
|
29
29
|
end
|
30
30
|
|
@@ -34,8 +34,8 @@ class ContentScrapper
|
|
34
34
|
@content_mappings << new_mapping
|
35
35
|
end
|
36
36
|
|
37
|
-
def sanitize_tags(
|
38
|
-
@sanitize_settings =
|
37
|
+
def sanitize_tags(&block)
|
38
|
+
@sanitize_settings = block.call()
|
39
39
|
end
|
40
40
|
|
41
41
|
def scrap_content(url)
|
@@ -44,17 +44,23 @@ class ContentScrapper
|
|
44
44
|
return nil if content_mapping.content_xpaths_list.empty?
|
45
45
|
begin
|
46
46
|
doc = Nokogiri::HTML(Kernel.open(url))
|
47
|
-
|
48
|
-
return nil if content.nil?
|
49
|
-
return Sanitize.clean(content, sanitize_settings)
|
47
|
+
return content_mapping.scrap_content(doc, content_scrapper = self)
|
50
48
|
rescue Exception
|
51
|
-
|
49
|
+
@scrapping_exception_handler_block.call($!) unless @scrapping_exception_handler_block.nil?
|
50
|
+
return nil
|
52
51
|
end
|
53
52
|
end
|
54
53
|
end
|
54
|
+
@missing_url_matcher_handler_block.call(url) unless @missing_url_matcher_handler_block.nil?
|
55
55
|
nil
|
56
56
|
end
|
57
57
|
|
58
|
-
def
|
58
|
+
def rescue_scrapping(&block)
|
59
|
+
@scrapping_exception_handler_block = block
|
60
|
+
end
|
61
|
+
|
62
|
+
def missing_url_matcher(&block)
|
63
|
+
@missing_url_matcher_handler_block = block
|
59
64
|
end
|
60
65
|
end
|
66
|
+
|
data/test/test_content_mapping.rb
CHANGED
@@ -26,7 +26,8 @@ class TestContentMapping < Test::Unit::TestCase
|
|
26
26
|
@document = Nokogiri::HTML(pretty_content)
|
27
27
|
end
|
28
28
|
should "extract the content" do
|
29
|
-
assert_match(%r{<p><strong>This is a strong text</strong></p>},
|
29
|
+
assert_match(%r{<p><strong>This is a strong text</strong></p>},
|
30
|
+
@mapping.scrap_content(@document))
|
30
31
|
end
|
31
32
|
end
|
32
33
|
context "on document with two content parts" do
|
data/test/test_content_scrapper.rb
CHANGED
@@ -5,7 +5,7 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
5
5
|
|
6
6
|
ContentScrapper.default_config_file = nil
|
7
7
|
|
8
|
-
context "on common
|
8
|
+
context "on common settings" do
|
9
9
|
setup do
|
10
10
|
@scrapper = ContentScrapper.new
|
11
11
|
@scrapper.instance_eval do
|
@@ -35,8 +35,10 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
35
35
|
content_at '//div[@id="never_should_be_here"]'
|
36
36
|
end
|
37
37
|
|
38
|
-
sanitize_tags
|
39
|
-
|
38
|
+
sanitize_tags do
|
39
|
+
{:elements => ['p','br', 'b', 'em', 'i', 'strong', 'u', 'a', 'h1', 'h2', 'h3', 'li', 'ol', 'ul'], \
|
40
|
+
:attributes => { 'a' => ['href'] }}
|
41
|
+
end
|
40
42
|
end
|
41
43
|
end
|
42
44
|
|
@@ -113,6 +115,34 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
113
115
|
end
|
114
116
|
end
|
115
117
|
end
|
118
|
+
|
119
|
+
context "on failing scrapping" do
|
120
|
+
setup do
|
121
|
+
Kernel.expects(:open).raises(Exception, 'something failed')
|
122
|
+
@exception_handle_flag = nil
|
123
|
+
@scrapper.rescue_scrapping do |exception|
|
124
|
+
@exception_handle_flag = exception.message
|
125
|
+
end
|
126
|
+
end
|
127
|
+
should "catch the exception and handle it" do
|
128
|
+
assert_nil @scrapper.scrap_content('http://www.pretty.url')
|
129
|
+
assert_equal 'something failed', @exception_handle_flag
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
context "on missing url matcher" do
|
134
|
+
setup do
|
135
|
+
Kernel.expects(:open).never
|
136
|
+
@missing_url_matcher_flag = nil
|
137
|
+
@scrapper.missing_url_matcher do |url|
|
138
|
+
@missing_url_matcher_flag = url
|
139
|
+
end
|
140
|
+
@scrapper.scrap_content('http://missing.url.matcher')
|
141
|
+
end
|
142
|
+
should "call the handler block" do
|
143
|
+
assert_equal 'http://missing.url.matcher', @missing_url_matcher_flag
|
144
|
+
end
|
145
|
+
end
|
116
146
|
end
|
117
147
|
|
118
148
|
context "on setting default content scrapper" do
|
@@ -126,6 +156,7 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
126
156
|
assert_equal @new_scrapper, ContentScrapper.default
|
127
157
|
end
|
128
158
|
end
|
159
|
+
|
129
160
|
context "for feed entry" do
|
130
161
|
setup do
|
131
162
|
@feed_entry = Feedzirra::Parser::RSSEntry.new
|
@@ -138,3 +169,4 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
138
169
|
end
|
139
170
|
end
|
140
171
|
end
|
172
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: content_scrapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gyorgy Frivolt
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-02-13 00:00:00 +01:00
|
12
|
+
date: 2010-02-22 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -64,6 +64,7 @@ extra_rdoc_files:
|
|
64
64
|
files:
|
65
65
|
- .document
|
66
66
|
- .gitignore
|
67
|
+
- .specification
|
67
68
|
- LICENSE
|
68
69
|
- README.rdoc
|
69
70
|
- Rakefile
|