content_scrapper 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.7
1
+ 0.0.8
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{content_scrapper}
8
- s.version = "0.0.7"
8
+ s.version = "0.0.8"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Gyorgy Frivolt"]
12
- s.date = %q{2010-02-28}
12
+ s.date = %q{2010-03-09}
13
13
  s.description = %q{If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.}
14
14
  s.email = %q{gyorgy.frivolt@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -51,12 +51,12 @@ class ContentScrapper
51
51
  end
52
52
  end
53
53
 
54
- def scrap_content(url)
54
+ def scrap_content(url, full_page = nil)
55
55
  content_mappings.each do | content_mapping |
56
56
  if content_mapping.matches_url?(url)
57
57
  return nil if content_mapping.content_xpaths_list.empty?
58
58
  begin
59
- doc = Nokogiri::HTML(Kernel.open(url))
59
+ doc = Nokogiri::HTML(full_page || Kernel.open(url))
60
60
  return content_mapping.scrap_content(doc, content_scrapper = self)
61
61
  rescue Exception
62
62
  @scrapping_exception_handler_block.call($!) unless @scrapping_exception_handler_block.nil?
@@ -1,17 +1,18 @@
1
1
  # feedzirra entries are extended by methods for scrapping content
2
2
  require 'feedzirra/feed_entry_utilities'
3
+ require 'ruby-debug'
3
4
 
4
5
  module Feedzirra
5
6
  module FeedEntryUtilities
6
7
 
7
8
  # Scrap the content based on the URL and the existing content and return it
8
- def scrap_content(scrapper = ContentScrapper.default)
9
- scrapper.scrap_content(self.url) || self.content.to_s
9
+ def scrap_content(scrapper = ContentScrapper.default, full_page = nil)
10
+ scrapper.scrap_content(self.url, full_page = full_page) || self.content.to_s
10
11
  end
11
12
 
12
13
  # Scrap the content or use the existing one and change the feed entry
13
- def scrap_content!(scrapper = ContentScrapper.default)
14
- content = scrap_content(scrapper)
14
+ def scrap_content!(scrapper = ContentScrapper.default, full_page = nil)
15
+ self.content = scrap_content(scrapper, full_page = full_page)
15
16
  end
16
17
  end
17
18
  end
@@ -107,6 +107,18 @@ class TestContentScrapper < Test::Unit::TestCase
107
107
  should("not match enything") { assert_nil @entry_content }
108
108
  end
109
109
 
110
+ context "on already downloaded document" do
111
+ setup do
112
+ pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
113
+ Kernel.expects(:open).never
114
+ @scrapped_content = @scrapper.scrap_content('http://www.pretty.url/hsdae',
115
+ full_page = pretty_content)
116
+ end
117
+ should "scrap from the provided full page" do
118
+ assert_match(%r{<p><strong>This is a strong text</strong></p>}, @scrapped_content)
119
+ end
120
+ end
121
+
110
122
  context "on scrapping with feedzirra" do
111
123
  setup do
112
124
  require 'content_scrapper/feedzirra'
@@ -132,6 +144,25 @@ class TestContentScrapper < Test::Unit::TestCase
132
144
  end
133
145
  end
134
146
  end
147
+
148
+ context "on feed entry with url and scrapping with full_page" do
149
+ setup do
150
+ @feed_entries = [ Feedzirra::Parser::RSSEntry.new, Feedzirra::Parser::AtomEntry.new ]
151
+ @feed_entries.each do |feed_entry|
152
+ feed_entry.url = 'http://www.pretty.url/wedhsf'
153
+ end
154
+ @pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
155
+ Kernel.expects(:open).never
156
+ end
157
+ should("return the original feed content") do
158
+ @feed_entries.each do |feed_entry|
159
+ assert_match(%r{<p><strong>This is a strong text</strong></p>},
160
+ feed_entry.scrap_content(@scrapper, full_page = @pretty_content))
161
+ feed_entry.scrap_content!(@scrapper, full_page = @pretty_content)
162
+ assert_match(%r{<p><strong>This is a strong text</strong></p>}, feed_entry.content)
163
+ end
164
+ end
165
+ end
135
166
  end
136
167
 
137
168
  context "on failing scrapping" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: content_scrapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gyorgy Frivolt
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-28 00:00:00 +01:00
12
+ date: 2010-03-09 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency