content_scrapper 0.0.7 → 0.0.8

data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.7
+ 0.0.8
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{content_scrapper}
-  s.version = "0.0.7"
+  s.version = "0.0.8"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Gyorgy Frivolt"]
-  s.date = %q{2010-02-28}
+  s.date = %q{2010-03-09}
   s.description = %q{If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.}
   s.email = %q{gyorgy.frivolt@gmail.com}
   s.extra_rdoc_files = [
@@ -51,12 +51,12 @@ class ContentScrapper
     end
   end
 
-  def scrap_content(url)
+  def scrap_content(url, full_page = nil)
     content_mappings.each do | content_mapping |
       if content_mapping.matches_url?(url)
         return nil if content_mapping.content_xpaths_list.empty?
         begin
-          doc = Nokogiri::HTML(Kernel.open(url))
+          doc = Nokogiri::HTML(full_page || Kernel.open(url))
           return content_mapping.scrap_content(doc, content_scrapper = self)
         rescue Exception
           @scrapping_exception_handler_block.call($!) unless @scrapping_exception_handler_block.nil?
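
The change above adds an optional full_page argument to ContentScrapper#scrap_content: callers that already hold the page body can pass it in, and Kernel.open is only used when full_page is nil. A minimal usage sketch, assuming a scrapper already configured elsewhere through the gem's DSL; the URL and the open-uri fetch are illustrative, not part of this diff:

require 'content_scrapper'
require 'open-uri'

scrapper = ContentScrapper.default      # assumed to be set up elsewhere via the DSL
url  = 'http://www.example.com/article' # illustrative URL
html = Kernel.open(url).read            # the caller downloads the page once

# New in 0.0.8: hand over the already-downloaded HTML, so scrap_content
# parses it directly instead of calling Kernel.open(url) again.
content = scrapper.scrap_content(url, html)
# => scrapped content, or nil when no content mapping matches the URL
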
@@ -1,17 +1,18 @@
 # feedzirra entries are extended by methods for scrapping content
 require 'feedzirra/feed_entry_utilities'
+require 'ruby-debug'
 
 module Feedzirra
   module FeedEntryUtilities
 
     # Scrap the content based on the URL and the existing content and return it
-    def scrap_content(scrapper = ContentScrapper.default)
-      scrapper.scrap_content(self.url) || self.content.to_s
+    def scrap_content(scrapper = ContentScrapper.default, full_page = nil)
+      scrapper.scrap_content(self.url, full_page = full_page) || self.content.to_s
     end
 
     # Scrap the content or use the existing one and change the feed entry
-    def scrap_content!(scrapper = ContentScrapper.default)
-      content = scrap_content(scrapper)
+    def scrap_content!(scrapper = ContentScrapper.default, full_page = nil)
+      self.content = scrap_content(scrapper, full_page = full_page)
     end
   end
 end
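
The same optional argument is threaded through the Feedzirra feed-entry extension, so an entry whose full article HTML is already at hand can be scrapped without re-fetching its URL. A hedged sketch, assuming a configured default scrapper and a feed whose entries carry the complete article body; the feed URL and that assumption are illustrative, not from this diff:

require 'content_scrapper'
require 'content_scrapper/feedzirra'
require 'feedzirra'

scrapper = ContentScrapper.default   # assumed to be configured elsewhere

feed = Feedzirra::Feed.fetch_and_parse('http://www.example.com/feed.rss')  # illustrative URL
feed.entries.each do |entry|
  # Pass the entry's own body as full_page; scrap_content! then replaces
  # entry.content with the scrapped version without opening entry.url.
  entry.scrap_content!(scrapper, entry.content)
end
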
@@ -107,6 +107,18 @@ class TestContentScrapper < Test::Unit::TestCase
     should("not match enything") { assert_nil @entry_content }
   end
 
+  context "on already downloaded document" do
+    setup do
+      pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
+      Kernel.expects(:open).never
+      @scrapped_content = @scrapper.scrap_content('http://www.pretty.url/hsdae',
+                                                  full_page = pretty_content)
+    end
+    should "scrap from the provided full page" do
+      assert_match(%r{<p><strong>This is a strong text</strong></p>}, @scrapped_content)
+    end
+  end
+
   context "on scrapping with feedzirra" do
     setup do
       require 'content_scrapper/feedzirra'
@@ -132,6 +144,25 @@ class TestContentScrapper < Test::Unit::TestCase
         end
       end
     end
+
+    context "on feed entry with url and scrapping with full_page" do
+      setup do
+        @feed_entries = [ Feedzirra::Parser::RSSEntry.new, Feedzirra::Parser::AtomEntry.new ]
+        @feed_entries.each do |feed_entry|
+          feed_entry.url = 'http://www.pretty.url/wedhsf'
+        end
+        @pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
+        Kernel.expects(:open).never
+      end
+      should("return the original feed content") do
+        @feed_entries.each do |feed_entry|
+          assert_match(%r{<p><strong>This is a strong text</strong></p>},
+            feed_entry.scrap_content(@scrapper, full_page = @pretty_content))
+          feed_entry.scrap_content!(@scrapper, full_page = @pretty_content)
+          assert_match(%r{<p><strong>This is a strong text</strong></p>}, feed_entry.content)
+        end
+      end
+    end
   end
 
   context "on failing scrapping" do
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: content_scrapper
 version: !ruby/object:Gem::Version
-  version: 0.0.7
+  version: 0.0.8
 platform: ruby
 authors:
 - Gyorgy Frivolt
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-02-28 00:00:00 +01:00
+date: 2010-03-09 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency