content_scrapper 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/content_scrapper.gemspec +2 -2
- data/lib/content_scrapper.rb +3 -1
- data/lib/content_scrapper/content_mapping.rb +1 -1
- data/test/test_content_scrapper.rb +3 -1
- data/test/test_pages.rb +0 -1
- metadata +2 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.9
|
1
|
+
0.0.10
|
data/content_scrapper.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{content_scrapper}
|
8
|
-
s.version = "0.0.9"
|
8
|
+
s.version = "0.0.10"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Gyorgy Frivolt"]
|
12
|
-
s.date = %q{2010-03-
|
12
|
+
s.date = %q{2010-03-12}
|
13
13
|
s.description = %q{If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.}
|
14
14
|
s.email = %q{gyorgy.frivolt@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/content_scrapper.rb
CHANGED
@@ -59,7 +59,9 @@ class ContentScrapper
|
|
59
59
|
doc = Nokogiri::HTML(options[:use_page] || Kernel.open(url))
|
60
60
|
return content_mapping.scrap_content(doc, content_scrapper = self)
|
61
61
|
rescue Exception
|
62
|
-
|
62
|
+
unless @scrapping_exception_handler_block.nil?
|
63
|
+
@scrapping_exception_handler_block.call($!, url)
|
64
|
+
end
|
63
65
|
return nil
|
64
66
|
end
|
65
67
|
end
|
@@ -30,7 +30,7 @@ class ContentMapping
|
|
30
30
|
content_section = doc.xpath(content_xpath)
|
31
31
|
content = content_section.to_a.join("\n")
|
32
32
|
content = content_scrapper.clean_content(content) unless content_scrapper.nil?
|
33
|
-
content = Iconv.conv(
|
33
|
+
content = Iconv.conv(iconv_to, iconv_from, content) unless iconv_to.nil?
|
34
34
|
return content if content_section.count > 0
|
35
35
|
end
|
36
36
|
nil
|
@@ -169,13 +169,15 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
169
169
|
setup do
|
170
170
|
Kernel.expects(:open).raises(Exception, 'something failed')
|
171
171
|
@exception_handle_flag = nil
|
172
|
-
@scrapper.rescue_scrapping do |exception|
|
172
|
+
@scrapper.rescue_scrapping do |exception, url|
|
173
173
|
@exception_handle_flag = exception.message
|
174
|
+
@exception_url = url
|
174
175
|
end
|
175
176
|
end
|
176
177
|
should "catch the exception and handle it" do
|
177
178
|
assert_nil @scrapper.scrap_content('http://www.pretty.url')
|
178
179
|
assert_equal 'something failed', @exception_handle_flag
|
180
|
+
assert_equal 'http://www.pretty.url', @exception_url
|
179
181
|
end
|
180
182
|
end
|
181
183
|
|
data/test/test_pages.rb
CHANGED
@@ -21,7 +21,6 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
21
21
|
Kernel.expects(:open).returns(StringIO.new(cdata_content))
|
22
22
|
end
|
23
23
|
should "not escape the cdata entries, should leave cdata unvisible" do
|
24
|
-
#<!--<![CDATA[
|
25
24
|
assert_match /<!--</, @scrapper.scrap_content('http://www.cdata.url/hsdae')
|
26
25
|
end
|
27
26
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: content_scrapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.9
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gyorgy Frivolt
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-03-
|
12
|
+
date: 2010-03-12 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|