content_scrapper 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/content_scrapper.gemspec +1 -1
- data/lib/content_scrapper.rb +2 -2
- data/lib/content_scrapper/feedzirra.rb +4 -4
- data/test/test_content_scrapper.rb +3 -3
- metadata +1 -1
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.8
|
1
|
+
0.0.9
|
data/content_scrapper.gemspec
CHANGED
data/lib/content_scrapper.rb
CHANGED
@@ -51,12 +51,12 @@ class ContentScrapper
|
|
51
51
|
end
|
52
52
|
end
|
53
53
|
|
54
|
-
def scrap_content(url,
|
54
|
+
def scrap_content(url, options = {})
|
55
55
|
content_mappings.each do | content_mapping |
|
56
56
|
if content_mapping.matches_url?(url)
|
57
57
|
return nil if content_mapping.content_xpaths_list.empty?
|
58
58
|
begin
|
59
|
-
doc = Nokogiri::HTML(
|
59
|
+
doc = Nokogiri::HTML(options[:use_page] || Kernel.open(url))
|
60
60
|
return content_mapping.scrap_content(doc, content_scrapper = self)
|
61
61
|
rescue Exception
|
62
62
|
@scrapping_exception_handler_block.call($!) unless @scrapping_exception_handler_block.nil?
|
data/lib/content_scrapper/feedzirra.rb
CHANGED
@@ -6,13 +6,13 @@ module Feedzirra
|
|
6
6
|
module FeedEntryUtilities
|
7
7
|
|
8
8
|
# Scrap the content based on the URL and the existing content and return it
|
9
|
-
def scrap_content(scrapper = ContentScrapper.default,
|
10
|
-
scrapper.scrap_content(self.url,
|
9
|
+
def scrap_content(scrapper = ContentScrapper.default, options = {})
|
10
|
+
scrapper.scrap_content(self.url, options) || self.content.to_s
|
11
11
|
end
|
12
12
|
|
13
13
|
# Scrap the content or use the existing one and change the feed entry
|
14
|
-
def scrap_content!(scrapper = ContentScrapper.default,
|
15
|
-
self.content = scrap_content(scrapper,
|
14
|
+
def scrap_content!(scrapper = ContentScrapper.default, options = {})
|
15
|
+
self.content = scrap_content(scrapper, options)
|
16
16
|
end
|
17
17
|
end
|
18
18
|
end
|
data/test/test_content_scrapper.rb
CHANGED
@@ -112,7 +112,7 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
112
112
|
pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
|
113
113
|
Kernel.expects(:open).never
|
114
114
|
@scrapped_content = @scrapper.scrap_content('http://www.pretty.url/hsdae',
|
115
|
-
|
115
|
+
:use_page => pretty_content)
|
116
116
|
end
|
117
117
|
should "scrap from the provided full page" do
|
118
118
|
assert_match(%r{<p><strong>This is a strong text</strong></p>}, @scrapped_content)
|
@@ -157,8 +157,8 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
157
157
|
should("return the original feed content") do
|
158
158
|
@feed_entries.each do |feed_entry|
|
159
159
|
assert_match(%r{<p><strong>This is a strong text</strong></p>},
|
160
|
-
feed_entry.scrap_content(@scrapper,
|
161
|
-
feed_entry.scrap_content!(@scrapper,
|
160
|
+
feed_entry.scrap_content(@scrapper, :use_page => @pretty_content))
|
161
|
+
feed_entry.scrap_content!(@scrapper, :use_page => @pretty_content)
|
162
162
|
assert_match(%r{<p><strong>This is a strong text</strong></p>}, feed_entry.content)
|
163
163
|
end
|
164
164
|
end
|