content_scrapper 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.3
1
+ 0.0.4
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{content_scrapper}
8
- s.version = "0.0.3"
8
+ s.version = "0.0.4"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Gyorgy Frivolt"]
@@ -40,15 +40,13 @@ class ContentScrapper
40
40
 
41
41
  def scrap_content(url)
42
42
  content_mappings.each do | content_mapping |
43
- if content_mapping.matches_url?(url) and !content_mapping.content_xpaths_list.empty?
43
+ if content_mapping.matches_url?(url)
44
+ return nil if content_mapping.content_xpaths_list.empty?
44
45
  begin
45
46
  doc = Nokogiri::HTML(Kernel.open(url))
46
47
  content = content_mapping.scrap_content(doc)
47
- if content.nil?
48
- return nil
49
- else
50
- return Sanitize.clean(content, sanitize_settings)
51
- end
48
+ return nil if content.nil?
49
+ return Sanitize.clean(content, sanitize_settings)
52
50
  rescue Exception
53
51
  scrap_content_exception($!)
54
52
  end
data/rails/init.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  require 'content_scrapper'
2
2
 
3
- ContentScrapper.default_config_file = "#{RAILS_ROOT}/config/content_scrapper.yml"
3
+ ContentScrapper.default_config_file = "#{RAILS_ROOT}/config/content_scrapper.rb"
@@ -26,6 +26,15 @@ class TestContentScrapper < Test::Unit::TestCase
26
26
  content_at '//div[@id="itext_second_content"]'
27
27
  end
28
28
 
29
+ content_mapping do
30
+ url_pattern /^http:\/\/www\.skipper\.url/
31
+ end
32
+
33
+ content_mapping do
34
+ url_pattern /^http:\/\/www\.skipper\.url/
35
+ content_at '//div[@id="never_should_be_here"]'
36
+ end
37
+
29
38
  sanitize_tags ({:elements => ['p','br', 'b', 'em', 'i', 'strong', 'u', 'a', 'h1', 'h2', 'h3', 'li', 'ol', 'ul'], \
30
39
  :attributes => { 'a' => ['href'] }})
31
40
  end
@@ -70,6 +79,14 @@ class TestContentScrapper < Test::Unit::TestCase
70
79
  end
71
80
  end
72
81
 
82
+ context "skipper patterns" do
83
+ setup do
84
+ Kernel.expects(:open).with('http://www.skipper.url/fdgsw').never
85
+ @entry_content = @scrapper.scrap_content('http://www.skipper.url/fdgsw')
86
+ end
87
+ should("not match enything") { assert_nil @entry_content }
88
+ end
89
+
73
90
  context "on scrapping with feedzirra" do
74
91
  setup do
75
92
  require 'content_scrapper/feedzirra'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: content_scrapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gyorgy Frivolt