content_scrapper 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/content_scrapper.gemspec +1 -1
- data/lib/content_scrapper.rb +4 -6
- data/rails/init.rb +1 -1
- data/test/test_content_scrapper.rb +17 -0
- metadata +1 -1
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.3
|
1
|
+
0.0.4
|
data/content_scrapper.gemspec
CHANGED
data/lib/content_scrapper.rb
CHANGED
@@ -40,15 +40,13 @@ class ContentScrapper
|
|
40
40
|
|
41
41
|
def scrap_content(url)
|
42
42
|
content_mappings.each do | content_mapping |
|
43
|
-
if content_mapping.matches_url?(url)
|
43
|
+
if content_mapping.matches_url?(url)
|
44
|
+
return nil if content_mapping.content_xpaths_list.empty?
|
44
45
|
begin
|
45
46
|
doc = Nokogiri::HTML(Kernel.open(url))
|
46
47
|
content = content_mapping.scrap_content(doc)
|
47
|
-
if content.nil?
|
48
|
-
|
49
|
-
else
|
50
|
-
return Sanitize.clean(content, sanitize_settings)
|
51
|
-
end
|
48
|
+
return nil if content.nil?
|
49
|
+
return Sanitize.clean(content, sanitize_settings)
|
52
50
|
rescue Exception
|
53
51
|
scrap_content_exception($!)
|
54
52
|
end
|
data/rails/init.rb
CHANGED
data/test/test_content_scrapper.rb
CHANGED
@@ -26,6 +26,15 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
26
26
|
content_at '//div[@id="itext_second_content"]'
|
27
27
|
end
|
28
28
|
|
29
|
+
content_mapping do
|
30
|
+
url_pattern /^http:\/\/www\.skipper\.url/
|
31
|
+
end
|
32
|
+
|
33
|
+
content_mapping do
|
34
|
+
url_pattern /^http:\/\/www\.skipper\.url/
|
35
|
+
content_at '//div[@id="never_should_be_here"]'
|
36
|
+
end
|
37
|
+
|
29
38
|
sanitize_tags ({:elements => ['p','br', 'b', 'em', 'i', 'strong', 'u', 'a', 'h1', 'h2', 'h3', 'li', 'ol', 'ul'], \
|
30
39
|
:attributes => { 'a' => ['href'] }})
|
31
40
|
end
|
@@ -70,6 +79,14 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
70
79
|
end
|
71
80
|
end
|
72
81
|
|
82
|
+
context "skipper patterns" do
|
83
|
+
setup do
|
84
|
+
Kernel.expects(:open).with('http://www.skipper.url/fdgsw').never
|
85
|
+
@entry_content = @scrapper.scrap_content('http://www.skipper.url/fdgsw')
|
86
|
+
end
|
87
|
+
should("not match enything") { assert_nil @entry_content }
|
88
|
+
end
|
89
|
+
|
73
90
|
context "on scrapping with feedzirra" do
|
74
91
|
setup do
|
75
92
|
require 'content_scrapper/feedzirra'
|