content_scrapper 0.0.3 → 0.0.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.3
1
+ 0.0.4
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{content_scrapper}
8
- s.version = "0.0.3"
8
+ s.version = "0.0.4"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Gyorgy Frivolt"]
@@ -40,15 +40,13 @@ class ContentScrapper
40
40
 
41
41
  def scrap_content(url)
42
42
  content_mappings.each do | content_mapping |
43
- if content_mapping.matches_url?(url) and !content_mapping.content_xpaths_list.empty?
43
+ if content_mapping.matches_url?(url)
44
+ return nil if content_mapping.content_xpaths_list.empty?
44
45
  begin
45
46
  doc = Nokogiri::HTML(Kernel.open(url))
46
47
  content = content_mapping.scrap_content(doc)
47
- if content.nil?
48
- return nil
49
- else
50
- return Sanitize.clean(content, sanitize_settings)
51
- end
48
+ return nil if content.nil?
49
+ return Sanitize.clean(content, sanitize_settings)
52
50
  rescue Exception
53
51
  scrap_content_exception($!)
54
52
  end
data/rails/init.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  require 'content_scrapper'
2
2
 
3
- ContentScrapper.default_config_file = "#{RAILS_ROOT}/config/content_scrapper.yml"
3
+ ContentScrapper.default_config_file = "#{RAILS_ROOT}/config/content_scrapper.rb"
@@ -26,6 +26,15 @@ class TestContentScrapper < Test::Unit::TestCase
26
26
  content_at '//div[@id="itext_second_content"]'
27
27
  end
28
28
 
29
+ content_mapping do
30
+ url_pattern /^http:\/\/www\.skipper\.url/
31
+ end
32
+
33
+ content_mapping do
34
+ url_pattern /^http:\/\/www\.skipper\.url/
35
+ content_at '//div[@id="never_should_be_here"]'
36
+ end
37
+
29
38
  sanitize_tags ({:elements => ['p','br', 'b', 'em', 'i', 'strong', 'u', 'a', 'h1', 'h2', 'h3', 'li', 'ol', 'ul'], \
30
39
  :attributes => { 'a' => ['href'] }})
31
40
  end
@@ -70,6 +79,14 @@ class TestContentScrapper < Test::Unit::TestCase
70
79
  end
71
80
  end
72
81
 
82
+ context "skipper patterns" do
83
+ setup do
84
+ Kernel.expects(:open).with('http://www.skipper.url/fdgsw').never
85
+ @entry_content = @scrapper.scrap_content('http://www.skipper.url/fdgsw')
86
+ end
87
+ should("not match enything") { assert_nil @entry_content }
88
+ end
89
+
73
90
  context "on scrapping with feedzirra" do
74
91
  setup do
75
92
  require 'content_scrapper/feedzirra'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: content_scrapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gyorgy Frivolt