content_scrapper 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.specification +114 -0
- data/VERSION +1 -1
- data/content_scrapper.gemspec +3 -2
- data/lib/content_scrapper/content_mapping.rb +4 -2
- data/lib/content_scrapper.rb +14 -8
- data/test/test_content_mapping.rb +2 -1
- data/test/test_content_scrapper.rb +35 -3
- metadata +3 -2
data/.specification
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: content_scrapper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.4
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Gyorgy Frivolt
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-13 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: thoughtbot-shoulda
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 2.10.2
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: mocha
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.9.8
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: sanitize
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.2.0
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: nokogiri
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.4.1
|
54
|
+
version:
|
55
|
+
description: If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.
|
56
|
+
email: gyorgy.frivolt@gmail.com
|
57
|
+
executables: []
|
58
|
+
|
59
|
+
extensions: []
|
60
|
+
|
61
|
+
extra_rdoc_files:
|
62
|
+
- LICENSE
|
63
|
+
- README.rdoc
|
64
|
+
files:
|
65
|
+
- .document
|
66
|
+
- .gitignore
|
67
|
+
- LICENSE
|
68
|
+
- README.rdoc
|
69
|
+
- Rakefile
|
70
|
+
- VERSION
|
71
|
+
- config/content_scrapper.rb
|
72
|
+
- content_scrapper.gemspec
|
73
|
+
- lib/content_scrapper.rb
|
74
|
+
- lib/content_scrapper/content_mapping.rb
|
75
|
+
- lib/content_scrapper/feedzirra.rb
|
76
|
+
- rails/init.rb
|
77
|
+
- test/helper.rb
|
78
|
+
- test/test_content_mapping.rb
|
79
|
+
- test/test_content_scrapper.rb
|
80
|
+
- test/test_pages/pretty.html
|
81
|
+
- test/test_pages/twocontent.html
|
82
|
+
- test/test_pages/ugly.html
|
83
|
+
has_rdoc: true
|
84
|
+
homepage: http://github.com/fifigyuri/content_scrapper
|
85
|
+
licenses: []
|
86
|
+
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options:
|
89
|
+
- --charset=UTF-8
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: "0"
|
97
|
+
version:
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: "0"
|
103
|
+
version:
|
104
|
+
requirements: []
|
105
|
+
|
106
|
+
rubyforge_project:
|
107
|
+
rubygems_version: 1.3.5
|
108
|
+
signing_key:
|
109
|
+
specification_version: 3
|
110
|
+
summary: Gem for those who want to screen scrap only the content part of web pages, blogs or articles.
|
111
|
+
test_files:
|
112
|
+
- test/test_content_mapping.rb
|
113
|
+
- test/test_content_scrapper.rb
|
114
|
+
- test/helper.rb
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.4
|
1
|
+
0.0.5
|
data/content_scrapper.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{content_scrapper}
|
8
|
-
s.version = "0.0.4"
|
8
|
+
s.version = "0.0.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Gyorgy Frivolt"]
|
12
|
-
s.date = %q{2010-02-13}
|
12
|
+
s.date = %q{2010-02-22}
|
13
13
|
s.description = %q{If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.}
|
14
14
|
s.email = %q{gyorgy.frivolt@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -19,6 +19,7 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
21
|
".gitignore",
|
22
|
+
".specification",
|
22
23
|
"LICENSE",
|
23
24
|
"README.rdoc",
|
24
25
|
"Rakefile",
|
@@ -19,10 +19,12 @@ class ContentMapping
|
|
19
19
|
url =~ @url_pattern_regexp
|
20
20
|
end
|
21
21
|
|
22
|
-
def scrap_content(doc)
|
22
|
+
def scrap_content(doc, content_scrapper = nil)
|
23
23
|
@content_xpaths_list.each do |content_xpath|
|
24
24
|
content_section = doc.xpath(content_xpath)
|
25
|
-
|
25
|
+
content = content_section.to_a.join("\n")
|
26
|
+
content = Sanitize.clean(content, content_scrapper.sanitize_settings) unless content_scrapper.nil?
|
27
|
+
return content if content_section.count > 0
|
26
28
|
end
|
27
29
|
nil
|
28
30
|
end
|
data/lib/content_scrapper.rb
CHANGED
@@ -24,7 +24,7 @@ class ContentScrapper
|
|
24
24
|
|
25
25
|
def initialize(scrapper_config_file = nil)
|
26
26
|
@content_mappings = []
|
27
|
-
config_file = ContentScrapper.default_config_file
|
27
|
+
config_file = scrapper_config_file || ContentScrapper.default_config_file
|
28
28
|
self.instance_eval(File.read(config_file), config_file) unless config_file.nil?
|
29
29
|
end
|
30
30
|
|
@@ -34,8 +34,8 @@ class ContentScrapper
|
|
34
34
|
@content_mappings << new_mapping
|
35
35
|
end
|
36
36
|
|
37
|
-
def sanitize_tags(
|
38
|
-
@sanitize_settings =
|
37
|
+
def sanitize_tags(&block)
|
38
|
+
@sanitize_settings = block.call()
|
39
39
|
end
|
40
40
|
|
41
41
|
def scrap_content(url)
|
@@ -44,17 +44,23 @@ class ContentScrapper
|
|
44
44
|
return nil if content_mapping.content_xpaths_list.empty?
|
45
45
|
begin
|
46
46
|
doc = Nokogiri::HTML(Kernel.open(url))
|
47
|
-
|
48
|
-
return nil if content.nil?
|
49
|
-
return Sanitize.clean(content, sanitize_settings)
|
47
|
+
return content_mapping.scrap_content(doc, content_scrapper = self)
|
50
48
|
rescue Exception
|
51
|
-
|
49
|
+
@scrapping_exception_handler_block.call($!) unless @scrapping_exception_handler_block.nil?
|
50
|
+
return nil
|
52
51
|
end
|
53
52
|
end
|
54
53
|
end
|
54
|
+
@missing_url_matcher_handler_block.call(url) unless @missing_url_matcher_handler_block.nil?
|
55
55
|
nil
|
56
56
|
end
|
57
57
|
|
58
|
-
def
|
58
|
+
def rescue_scrapping(&block)
|
59
|
+
@scrapping_exception_handler_block = block
|
60
|
+
end
|
61
|
+
|
62
|
+
def missing_url_matcher(&block)
|
63
|
+
@missing_url_matcher_handler_block = block
|
59
64
|
end
|
60
65
|
end
|
66
|
+
|
@@ -26,7 +26,8 @@ class TestContentMapping < Test::Unit::TestCase
|
|
26
26
|
@document = Nokogiri::HTML(pretty_content)
|
27
27
|
end
|
28
28
|
should "extract the content" do
|
29
|
-
assert_match(%r{<p><strong>This is a strong text</strong></p>},
|
29
|
+
assert_match(%r{<p><strong>This is a strong text</strong></p>},
|
30
|
+
@mapping.scrap_content(@document))
|
30
31
|
end
|
31
32
|
end
|
32
33
|
context "on document with two content parts" do
|
@@ -5,7 +5,7 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
5
5
|
|
6
6
|
ContentScrapper.default_config_file = nil
|
7
7
|
|
8
|
-
context "on common
|
8
|
+
context "on common settings" do
|
9
9
|
setup do
|
10
10
|
@scrapper = ContentScrapper.new
|
11
11
|
@scrapper.instance_eval do
|
@@ -35,8 +35,10 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
35
35
|
content_at '//div[@id="never_should_be_here"]'
|
36
36
|
end
|
37
37
|
|
38
|
-
sanitize_tags
|
39
|
-
|
38
|
+
sanitize_tags do
|
39
|
+
{:elements => ['p','br', 'b', 'em', 'i', 'strong', 'u', 'a', 'h1', 'h2', 'h3', 'li', 'ol', 'ul'], \
|
40
|
+
:attributes => { 'a' => ['href'] }}
|
41
|
+
end
|
40
42
|
end
|
41
43
|
end
|
42
44
|
|
@@ -113,6 +115,34 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
113
115
|
end
|
114
116
|
end
|
115
117
|
end
|
118
|
+
|
119
|
+
context "on failing scrapping" do
|
120
|
+
setup do
|
121
|
+
Kernel.expects(:open).raises(Exception, 'something failed')
|
122
|
+
@exception_handle_flag = nil
|
123
|
+
@scrapper.rescue_scrapping do |exception|
|
124
|
+
@exception_handle_flag = exception.message
|
125
|
+
end
|
126
|
+
end
|
127
|
+
should "catch the exception and handle it" do
|
128
|
+
assert_nil @scrapper.scrap_content('http://www.pretty.url')
|
129
|
+
assert_equal 'something failed', @exception_handle_flag
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
context "on missing url matcher" do
|
134
|
+
setup do
|
135
|
+
Kernel.expects(:open).never
|
136
|
+
@missing_url_matcher_flag = nil
|
137
|
+
@scrapper.missing_url_matcher do |url|
|
138
|
+
@missing_url_matcher_flag = url
|
139
|
+
end
|
140
|
+
@scrapper.scrap_content('http://missing.url.matcher')
|
141
|
+
end
|
142
|
+
should "call the handler block" do
|
143
|
+
assert_equal 'http://missing.url.matcher', @missing_url_matcher_flag
|
144
|
+
end
|
145
|
+
end
|
116
146
|
end
|
117
147
|
|
118
148
|
context "on setting default content scrapper" do
|
@@ -126,6 +156,7 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
126
156
|
assert_equal @new_scrapper, ContentScrapper.default
|
127
157
|
end
|
128
158
|
end
|
159
|
+
|
129
160
|
context "for feed entry" do
|
130
161
|
setup do
|
131
162
|
@feed_entry = Feedzirra::Parser::RSSEntry.new
|
@@ -138,3 +169,4 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
138
169
|
end
|
139
170
|
end
|
140
171
|
end
|
172
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: content_scrapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gyorgy Frivolt
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-02-13 00:00:00 +01:00
|
12
|
+
date: 2010-02-22 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -64,6 +64,7 @@ extra_rdoc_files:
|
|
64
64
|
files:
|
65
65
|
- .document
|
66
66
|
- .gitignore
|
67
|
+
- .specification
|
67
68
|
- LICENSE
|
68
69
|
- README.rdoc
|
69
70
|
- Rakefile
|