content_scrapper 0.0.7 → 0.0.8
- data/VERSION +1 -1
- data/content_scrapper.gemspec +2 -2
- data/lib/content_scrapper.rb +2 -2
- data/lib/content_scrapper/feedzirra.rb +5 -4
- data/test/test_content_scrapper.rb +31 -0
- metadata +2 -2
data/VERSION
CHANGED
@@ -1 +1 @@
-0.0.7
+0.0.8
data/content_scrapper.gemspec
CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{content_scrapper}
-  s.version = "0.0.7"
+  s.version = "0.0.8"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Gyorgy Frivolt"]
-  s.date = %q{2010-
+  s.date = %q{2010-03-09}
   s.description = %q{If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.}
   s.email = %q{gyorgy.frivolt@gmail.com}
   s.extra_rdoc_files = [
data/lib/content_scrapper.rb
CHANGED
@@ -51,12 +51,12 @@ class ContentScrapper
     end
   end
 
-  def scrap_content(url)
+  def scrap_content(url, full_page = nil)
     content_mappings.each do | content_mapping |
       if content_mapping.matches_url?(url)
         return nil if content_mapping.content_xpaths_list.empty?
         begin
-          doc = Nokogiri::HTML(Kernel.open(url))
+          doc = Nokogiri::HTML(full_page || Kernel.open(url))
           return content_mapping.scrap_content(doc, content_scrapper = self)
         rescue Exception
           @scrapping_exception_handler_block.call($!) unless @scrapping_exception_handler_block.nil?
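
The effect of this change is that ContentScrapper#scrap_content can now be handed a page body that was fetched earlier; when the second argument is non-nil, Nokogiri parses it directly and Kernel.open(url) is never called. A minimal usage sketch, assuming a default scrapper has already been configured; the URL and cached file path are placeholders, not part of the gem:

# Sketch only: URL and cached file are illustrative.
require 'content_scrapper'

cached_html = File.read('cache/article.html')   # page downloaded earlier by other means
scrapper    = ContentScrapper.default           # previously configured default scrapper

# Passing the body as the second argument skips the HTTP request for the URL.
content = scrapper.scrap_content('http://example.com/article', cached_html)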
data/lib/content_scrapper/feedzirra.rb
CHANGED
@@ -1,17 +1,18 @@
 # feedzirra entries are extended by methods for scrapping content
 require 'feedzirra/feed_entry_utilities'
+require 'ruby-debug'
 
 module Feedzirra
   module FeedEntryUtilities
 
     # Scrap the content based on the URL and the existing content and return it
-    def scrap_content(scrapper = ContentScrapper.default)
-      scrapper.scrap_content(self.url) || self.content.to_s
+    def scrap_content(scrapper = ContentScrapper.default, full_page = nil)
+      scrapper.scrap_content(self.url, full_page = full_page) || self.content.to_s
     end
 
     # Scrap the content or use the existing one and change the feed entry
-    def scrap_content!(scrapper = ContentScrapper.default)
-      content = scrap_content(scrapper)
+    def scrap_content!(scrapper = ContentScrapper.default, full_page = nil)
+      self.content = scrap_content(scrapper, full_page = full_page)
     end
   end
 end
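
On the feedzirra side the same optional argument is threaded through the entry helpers, so a feed processor that has already downloaded the article can reuse that body instead of re-fetching it. A hedged sketch mirroring the new test cases; the entry construction, URL, and file path are illustrative:

require 'content_scrapper/feedzirra'

entry = Feedzirra::Parser::RSSEntry.new
entry.url = 'http://example.com/article'
body = File.read('cache/article.html')   # already-downloaded page

# Returns the scrapped content, falling back to entry.content on failure.
scrapped = entry.scrap_content(ContentScrapper.default, body)

# Replaces entry.content in place with the scrapped content.
entry.scrap_content!(ContentScrapper.default, body)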
data/test/test_content_scrapper.rb
CHANGED
@@ -107,6 +107,18 @@ class TestContentScrapper < Test::Unit::TestCase
     should("not match enything") { assert_nil @entry_content }
   end
 
+  context "on already downloaded document" do
+    setup do
+      pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
+      Kernel.expects(:open).never
+      @scrapped_content = @scrapper.scrap_content('http://www.pretty.url/hsdae',
+        full_page = pretty_content)
+    end
+    should "scrap from the provided full page" do
+      assert_match(%r{<p><strong>This is a strong text</strong></p>}, @scrapped_content)
+    end
+  end
+
   context "on scrapping with feedzirra" do
     setup do
       require 'content_scrapper/feedzirra'
@@ -132,6 +144,25 @@ class TestContentScrapper < Test::Unit::TestCase
         end
       end
     end
+
+    context "on feed entry with url and scrapping with full_page" do
+      setup do
+        @feed_entries = [ Feedzirra::Parser::RSSEntry.new, Feedzirra::Parser::AtomEntry.new ]
+        @feed_entries.each do |feed_entry|
+          feed_entry.url = 'http://www.pretty.url/wedhsf'
+        end
+        @pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
+        Kernel.expects(:open).never
+      end
+      should("return the original feed content") do
+        @feed_entries.each do |feed_entry|
+          assert_match(%r{<p><strong>This is a strong text</strong></p>},
+            feed_entry.scrap_content(@scrapper, full_page = @pretty_content))
+          feed_entry.scrap_content!(@scrapper, full_page = @pretty_content)
+          assert_match(%r{<p><strong>This is a strong text</strong></p>}, feed_entry.content)
+        end
+      end
+    end
   end
 
   context "on failing scrapping" do
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: content_scrapper
 version: !ruby/object:Gem::Version
-  version: 0.0.7
+  version: 0.0.8
 platform: ruby
 authors:
 - Gyorgy Frivolt
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-
+date: 2010-03-09 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency