content_scrapper 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/content_scrapper.gemspec +2 -2
- data/lib/content_scrapper.rb +2 -2
- data/lib/content_scrapper/feedzirra.rb +5 -4
- data/test/test_content_scrapper.rb +31 -0
- metadata +2 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.7
|
1
|
+
0.0.8
|
data/content_scrapper.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{content_scrapper}
|
8
|
-
s.version = "0.0.
|
8
|
+
s.version = "0.0.8"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Gyorgy Frivolt"]
|
12
|
-
s.date = %q{2010-
|
12
|
+
s.date = %q{2010-03-09}
|
13
13
|
s.description = %q{If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.}
|
14
14
|
s.email = %q{gyorgy.frivolt@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/content_scrapper.rb
CHANGED
@@ -51,12 +51,12 @@ class ContentScrapper
|
|
51
51
|
end
|
52
52
|
end
|
53
53
|
|
54
|
-
def scrap_content(url)
|
54
|
+
def scrap_content(url, full_page = nil)
|
55
55
|
content_mappings.each do | content_mapping |
|
56
56
|
if content_mapping.matches_url?(url)
|
57
57
|
return nil if content_mapping.content_xpaths_list.empty?
|
58
58
|
begin
|
59
|
-
doc = Nokogiri::HTML(Kernel.open(url))
|
59
|
+
doc = Nokogiri::HTML(full_page || Kernel.open(url))
|
60
60
|
return content_mapping.scrap_content(doc, content_scrapper = self)
|
61
61
|
rescue Exception
|
62
62
|
@scrapping_exception_handler_block.call($!) unless @scrapping_exception_handler_block.nil?
|
@@ -1,17 +1,18 @@
|
|
1
1
|
# feedzirra entries are extended by methods for scrapping content
|
2
2
|
require 'feedzirra/feed_entry_utilities'
|
3
|
+
require 'ruby-debug'
|
3
4
|
|
4
5
|
module Feedzirra
|
5
6
|
module FeedEntryUtilities
|
6
7
|
|
7
8
|
# Scrap the content based on the URL and the existing content and return it
|
8
|
-
def scrap_content(scrapper = ContentScrapper.default)
|
9
|
-
scrapper.scrap_content(self.url) || self.content.to_s
|
9
|
+
def scrap_content(scrapper = ContentScrapper.default, full_page = nil)
|
10
|
+
scrapper.scrap_content(self.url, full_page = full_page) || self.content.to_s
|
10
11
|
end
|
11
12
|
|
12
13
|
# Scrap the content or use the existing one and change the feed entry
|
13
|
-
def scrap_content!(scrapper = ContentScrapper.default)
|
14
|
-
content = scrap_content(scrapper)
|
14
|
+
def scrap_content!(scrapper = ContentScrapper.default, full_page = nil)
|
15
|
+
self.content = scrap_content(scrapper, full_page = full_page)
|
15
16
|
end
|
16
17
|
end
|
17
18
|
end
|
@@ -107,6 +107,18 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
107
107
|
should("not match enything") { assert_nil @entry_content }
|
108
108
|
end
|
109
109
|
|
110
|
+
context "on already downloaded document" do
|
111
|
+
setup do
|
112
|
+
pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
|
113
|
+
Kernel.expects(:open).never
|
114
|
+
@scrapped_content = @scrapper.scrap_content('http://www.pretty.url/hsdae',
|
115
|
+
full_page = pretty_content)
|
116
|
+
end
|
117
|
+
should "scrap from the provided full page" do
|
118
|
+
assert_match(%r{<p><strong>This is a strong text</strong></p>}, @scrapped_content)
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
110
122
|
context "on scrapping with feedzirra" do
|
111
123
|
setup do
|
112
124
|
require 'content_scrapper/feedzirra'
|
@@ -132,6 +144,25 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
132
144
|
end
|
133
145
|
end
|
134
146
|
end
|
147
|
+
|
148
|
+
context "on feed entry with url and scrapping with full_page" do
|
149
|
+
setup do
|
150
|
+
@feed_entries = [ Feedzirra::Parser::RSSEntry.new, Feedzirra::Parser::AtomEntry.new ]
|
151
|
+
@feed_entries.each do |feed_entry|
|
152
|
+
feed_entry.url = 'http://www.pretty.url/wedhsf'
|
153
|
+
end
|
154
|
+
@pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
|
155
|
+
Kernel.expects(:open).never
|
156
|
+
end
|
157
|
+
should("return the original feed content") do
|
158
|
+
@feed_entries.each do |feed_entry|
|
159
|
+
assert_match(%r{<p><strong>This is a strong text</strong></p>},
|
160
|
+
feed_entry.scrap_content(@scrapper, full_page = @pretty_content))
|
161
|
+
feed_entry.scrap_content!(@scrapper, full_page = @pretty_content)
|
162
|
+
assert_match(%r{<p><strong>This is a strong text</strong></p>}, feed_entry.content)
|
163
|
+
end
|
164
|
+
end
|
165
|
+
end
|
135
166
|
end
|
136
167
|
|
137
168
|
context "on failing scrapping" do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: content_scrapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gyorgy Frivolt
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-
|
12
|
+
date: 2010-03-09 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|