newscrapi 0.0.11
- data/.document +5 -0
- data/.gitignore +23 -0
- data/LICENSE +20 -0
- data/README.rdoc +17 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/config/content_scrapper.rb +3 -0
- data/doc/classes/ContentMapping.html +242 -0
- data/doc/classes/ContentMapping.src/M000001.html +18 -0
- data/doc/classes/ContentMapping.src/M000002.html +18 -0
- data/doc/classes/ContentMapping.src/M000003.html +18 -0
- data/doc/classes/ContentMapping.src/M000004.html +19 -0
- data/doc/classes/ContentMapping.src/M000005.html +18 -0
- data/doc/classes/ContentMapping.src/M000006.html +25 -0
- data/doc/classes/ContentScrapper.html +297 -0
- data/doc/classes/ContentScrapper.src/M000007.html +18 -0
- data/doc/classes/ContentScrapper.src/M000008.html +18 -0
- data/doc/classes/ContentScrapper.src/M000009.html +20 -0
- data/doc/classes/ContentScrapper.src/M000010.html +20 -0
- data/doc/classes/ContentScrapper.src/M000011.html +18 -0
- data/doc/classes/ContentScrapper.src/M000012.html +21 -0
- data/doc/classes/ContentScrapper.src/M000013.html +21 -0
- data/doc/classes/ContentScrapper.src/M000014.html +33 -0
- data/doc/classes/ContentScrapper.src/M000015.html +18 -0
- data/doc/classes/ContentScrapper.src/M000016.html +18 -0
- data/doc/classes/Feedzirra.html +111 -0
- data/doc/classes/Feedzirra/FeedEntryUtilities.html +152 -0
- data/doc/classes/Feedzirra/FeedEntryUtilities.src/M000017.html +18 -0
- data/doc/classes/Feedzirra/FeedEntryUtilities.src/M000018.html +18 -0
- data/doc/created.rid +1 -0
- data/doc/files/lib/content_scrapper/content_mapping_rb.html +108 -0
- data/doc/files/lib/content_scrapper/feedzirra_rb.html +115 -0
- data/doc/files/lib/content_scrapper_rb.html +112 -0
- data/doc/fr_class_index.html +30 -0
- data/doc/fr_file_index.html +29 -0
- data/doc/fr_method_index.html +44 -0
- data/doc/index.html +24 -0
- data/doc/rdoc-style.css +208 -0
- data/lib/newscrapi.rb +2 -0
- data/lib/newscrapi/encoding.rb +44 -0
- data/lib/newscrapi/feedzirra.rb +17 -0
- data/lib/newscrapi/mapping.rb +50 -0
- data/lib/newscrapi/scrapper.rb +129 -0
- data/lib/newscrapi/testing.rb +19 -0
- data/rails/init.rb +3 -0
- data/test/helper.rb +9 -0
- data/test/test_encoding.rb +43 -0
- data/test/test_mapping.rb +58 -0
- data/test/test_pages.rb +69 -0
- data/test/test_pages/cdata.html +23 -0
- data/test/test_pages/page_without_encoding_meta_tag.html +401 -0
- data/test/test_pages/pretty.html +17 -0
- data/test/test_pages/pretty_missing_content.html +17 -0
- data/test/test_pages/twocontent.html +11 -0
- data/test/test_pages/ugly.html +399 -0
- data/test/test_pages/utf-8_page.html +405 -0
- data/test/test_pages/windows-1250_page.html +460 -0
- data/test/test_scrapper.rb +257 -0
- metadata +191 -0
data/lib/newscrapi.rb
ADDED
data/lib/newscrapi/encoding.rb
ADDED
@@ -0,0 +1,44 @@
+
+require 'rchardet'
+require 'nokogiri'
+require 'iconv'
+
+module Newscrapi
+  module Encoding
+
+    def self.guess_html_encoding(obj)
+      doc, page = parse_parameters_doc_page(obj)
+
+      meta_encoding = doc.meta_encoding
+      return meta_encoding unless meta_encoding.nil?
+      CharDet.detect(page)['encoding']
+    end
+
+    def self.get_html_doc_with_changed_encoding(obj, encode_to)
+      doc, page = parse_parameters_doc_page(obj)
+
+      if encode_to
+        guessed_encoding = guess_html_encoding(page)
+        if guessed_encoding != encode_to
+          doc = doc.serialize(:encoding => encode_to)
+          page = doc.to_s
+          return Nokogiri::HTML(page)
+        end
+      end
+      doc
+    end
+
+    private
+
+    def self.parse_parameters_doc_page(obj)
+      if (obj.class == String)
+        page = obj
+        doc = Nokogiri::HTML(page)
+      elsif (obj.class == Nokogiri::HTML::Document)
+        doc = obj
+        page = doc.to_s
+      else raise Exception.new("Not supported type #{obj.class.to_s}") end
+      return doc, page
+    end
+  end
+end
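For orientation, a minimal sketch of the two public helpers above; the inline page is hypothetical:

require 'newscrapi/encoding'

page = "<html><head><meta http-equiv='Content-Type' " +
       "content='text/html; charset=windows-1250'></head><body>ok</body></html>"

# The meta tag wins; CharDet.detect is only the fallback for pages without one.
Newscrapi::Encoding.guess_html_encoding(page)   # => "windows-1250"

# Returns a Nokogiri::HTML::Document, re-encoded when the guess differs from the target.
doc = Newscrapi::Encoding.get_html_doc_with_changed_encoding(page, 'utf-8')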
data/lib/newscrapi/feedzirra.rb
ADDED
@@ -0,0 +1,17 @@
+# feedzirra entries are extended by methods for scrapping content
+require 'feedzirra/feed_entry_utilities'
+
+module Feedzirra
+  module FeedEntryUtilities
+
+    # Scrap the content based on the URL and the existing content and return it
+    def scrap_content(scrapper = Newscrapi::Scrapper.default, options = {})
+      scrapper.scrap_content(self.url, options) || self.content.to_s
+    end
+
+    # Scrap the content or use the existing one and change the feed entry
+    def scrap_content!(scrapper = Newscrapi::Scrapper.default, options = {})
+      self.content = scrap_content(scrapper, options)
+    end
+  end
+end
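A sketch of the intended use together with feedzirra; the feed URL and config path are made up:

require 'newscrapi'
require 'newscrapi/feedzirra'

Newscrapi::Scrapper.new('config/content_scrapper.rb').set_as_default  # hypothetical config

feed = Feedzirra::Feed.fetch_and_parse('http://example.com/feed.rss')
feed.entries.each do |entry|
  entry.scrap_content!   # replaces entry.content with the scrapped article body
end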
data/lib/newscrapi/mapping.rb
ADDED
@@ -0,0 +1,50 @@
+require 'newscrapi/scrapper'
+require 'iconv'
+
+class Newscrapi::Mapping
+
+  attr_reader :content_xpaths_list, :url_pattern_regexp
+
+  def initialize
+    @content_xpaths_list = []
+  end
+
+  def url_pattern(pattern)
+    @url_pattern_regexp = pattern.class == String ?
+      Regexp.compile("^#{Regexp.escape(pattern).gsub('\*','.*')}$") : pattern
+  end
+
+  def content_at(content_xpath)
+    @content_xpaths_list << content_xpath
+  end
+
+  def iconv(args)
+    suppose_encoding(args[:from])
+    convert_to(args[:to])
+  end
+
+=begin
+  def suppose_encoding(encoding = nil)
+    return @supposed_encoding if encoding.nil?
+    @supposed_encoding = encoding
+  end
+=end
+
+  def matches_url?(url)
+    url =~ @url_pattern_regexp
+  end
+
+  def scrap_content(obj, content_scrapper = nil)
+    doc = Newscrapi::Scrapper.parse_page(obj)
+    @content_xpaths_list.each do |content_xpath|
+      content_section = doc.xpath(content_xpath)
+      if content_section.count > 0
+        content = content_section.to_a.join("\n")
+        content = content_scrapper.clean_content(content) unless content_scrapper.nil?
+        return content
+      end
+    end
+    nil
+  end
+end
+
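A mapping can be exercised on its own as well; a small sketch, with the pattern, XPath and document invented for illustration:

require 'newscrapi/mapping'

mapping = Newscrapi::Mapping.new
mapping.instance_eval do
  url_pattern 'http://news.example.com/*'   # wildcard strings are compiled to a Regexp
  content_at  '//div[@id="article"]'        # xpaths are tried in insertion order
end

mapping.matches_url?('http://news.example.com/a.html')   # => 0 (truthy match offset)

html = '<html><body><div id="article"><p>Hi</p></div></body></html>'
mapping.scrap_content(html)   # => the first matching section, serialized as HTML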
data/lib/newscrapi/scrapper.rb
ADDED
@@ -0,0 +1,129 @@
+
+require 'open-uri'
+require 'nokogiri'
+
+require 'newscrapi/encoding'
+require 'newscrapi/mapping'
+
+module Newscrapi
+
+  class Scrapper
+
+    class << self
+      attr_accessor :default_config_file, :default
+      default_config_file = "#{File.dirname(__FILE__)}/../config/content_scrapper.rb"
+
+      def create_new_default(*args)
+        self.default = self.new(*args)
+      end
+    end
+
+    def set_as_default
+      Newscrapi::Scrapper.default = self
+    end
+
+    attr_reader :content_mappings, :scrapping_exception_handler_block,
+      :missing_url_matcher_handler_block, :missing_content_handler_block
+
+    def self.parse_page(obj)
+      return obj if obj.class == Nokogiri::HTML::Document
+      Nokogiri::HTML(obj)
+    end
+
+    def initialize(scrapper_config_file = nil)
+      @content_mappings = []
+      config_file = scrapper_config_file || Newscrapi::Scrapper.default_config_file
+      self.instance_eval(File.read(config_file), config_file) unless config_file.nil?
+    end
+
+    def encode_to(encoding = nil)
+      @encode_to = encoding unless encoding.nil?
+      @encode_to
+    end
+
+    def content_mapping(&block)
+      new_mapping = Newscrapi::Mapping.new
+      new_mapping.instance_eval(&block)
+      @content_mappings << new_mapping
+    end
+
+    def clean_content(content)
+      @content_cleaner_block.nil? ? content : @content_cleaner_block.call(content)
+    end
+
+    def sanitize_tags(&sanitize_settings)
+      @content_cleaner_block = lambda do |content|
+        require 'sanitize'
+        Sanitize.clean(content, sanitize_settings.call())
+      end
+    end
+
+    def loofah_tags(scrap_type)
+      @content_scrapper_block = lambda do |content|
+        require 'loofah'
+        Loofah.document(content).scrub!(scrap_type).to_s
+      end
+    end
+
+    def matching_content_mapper(url)
+      content_mappings.each { |content_mapping| return content_mapping if content_mapping.matches_url?(url) }
+      nil
+    end
+
+    def scrap_content(url, options = {})
+      if (content_mapping = matching_content_mapper(url)).nil?
+        @missing_url_matcher_handler_block.call(url) unless @missing_url_matcher_handler_block.nil?
+        return nil
+      end
+      return nil if content_mapping.content_xpaths_list.empty?
+      begin
+        use_page = ensure_encoding(options[:use_page] || Kernel.open(url).read)
+
+        doc = Newscrapi::Scrapper.parse_page(use_page)
+        scrapped_content = content_mapping.scrap_content(doc, content_scrapper = self)
+
+        @missing_content_handler_block.call(url) if !@missing_content_handler_block.nil? and scrapped_content.nil?
+        return scrapped_content
+      rescue Exception
+        @scrapping_exception_handler_block.call($!, url) unless @scrapping_exception_handler_block.nil?
+        return nil
+      end
+      nil
+    end
+
+    def rescue_scrapping(&block)
+      @scrapping_exception_handler_block = block
+    end
+
+    def missing_url_matcher(&block)
+      @missing_url_matcher_handler_block = block
+    end
+
+    def missing_content(&block)
+      @missing_content_handler_block = block
+    end
+
+    def report_to_stderr
+      rescue_scrapping do |exception, url|
+        STDERR << "error occured during scrapping page #{url}\n"
+        STDERR << "#{exception.message}\n"
+        STDERR << exception.backtrace.join("\n")
+      end
+
+      missing_url_matcher do |url|
+        STDERR << "missing matcher for #{url}\n"
+      end
+
+      missing_content do |url|
+        STDERR << "empty content for #{url}\n"
+      end
+    end
+
+    private
+
+    def ensure_encoding(str)
+      Newscrapi::Encoding::get_html_doc_with_changed_encoding(str, self.encode_to)
+    end
+
+  end
+end
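The constructor instance_evals a config file in the scrapper's own context, so a configuration is just the DSL above in a plain Ruby file. A rough sketch, with the path, pattern and XPath invented:

# config/content_scrapper.rb -- hypothetical configuration
encode_to 'utf-8'

content_mapping do
  url_pattern 'http://news.example.com/*'
  content_at  '//div[@id="article"]'
end

# Whitelist-based cleanup through the sanitize gem; the block must return a
# Sanitize config hash.
sanitize_tags do
  { :elements => ['p', 'a', 'strong', 'em'], :attributes => { 'a' => ['href'] } }
end

report_to_stderr

# Elsewhere, with the config path passed explicitly:
#   scrapper = Newscrapi::Scrapper.new('config/content_scrapper.rb')
#   scrapper.scrap_content('http://news.example.com/story.html')  # => cleaned HTML or nil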
data/lib/newscrapi/testing.rb
ADDED
@@ -0,0 +1,19 @@
+require 'newscrapi/scrapper'
+
+class Newscrapi::Scrapper
+
+  alias :old_initialize :initialize
+
+  def initialize
+    old_initialize
+    testing_report_to_stderr
+  end
+
+  def testing_report_to_stderr
+    rescue_scrapping do |exception, url|
+      # extended_exception = Exception.new("error occured during scrapping page #{url}: #{exception.message}")
+      # extended_exception.set_backtrace(exception.backtrace)
+      raise exception #extended_exception
+    end
+  end
+end
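Requiring this file in a suite makes swallowed scrapping errors resurface as failures; a sketch, assuming the shoulda/mocha setup used by the tests below:

require 'newscrapi/scrapper'
require 'newscrapi/testing'

scrapper = Newscrapi::Scrapper.new   # note: the patched initialize takes no config argument
scrapper.instance_eval do
  content_mapping do
    url_pattern /.*/
    content_at '//div[@id="content"]'   # hypothetical xpath
  end
end

# Fetch/parse exceptions now propagate out of scrap_content instead of
# being swallowed and turned into nil.
scrapper.scrap_content('http://unreachable.example/')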
data/rails/init.rb
ADDED
data/test/helper.rb
ADDED
data/test/test_encoding.rb
ADDED
@@ -0,0 +1,43 @@
+
+require 'helper'
+require 'nokogiri'
+require 'newscrapi/encoding'
+
+class TestEncoding < Test::Unit::TestCase
+
+  context "on guessing the encoding of a page with a metatag defined" do
+    setup do
+      @page = File.open("#{File.dirname(__FILE__)}/test_pages/windows-1250_page.html").read
+      @doc = Nokogiri::HTML(@page)
+    end
+    should "detect the page encoding correctly for string input" do
+      assert_equal 'windows-1250', Newscrapi::Encoding.guess_html_encoding(@page)
+    end
+    should "detect the page encoding correctly for parsed document input" do
+      assert_equal 'windows-1250', Newscrapi::Encoding.guess_html_encoding(@doc)
+    end
+  end
+
+  context "on guessing the encoding of a page without the encoding metatag defined" do
+    setup do
+      @page = File.open("#{File.dirname(__FILE__)}/test_pages/utf-8_page.html").read
+      @doc = Nokogiri::HTML(@page)
+    end
+    should "detect the page encoding correctly for string input" do
+      assert_equal 'utf-8', Newscrapi::Encoding.guess_html_encoding(@page)
+    end
+    should "detect the page encoding correctly for parsed document input" do
+      assert_equal 'utf-8', Newscrapi::Encoding.guess_html_encoding(@doc)
+    end
+  end
+
+  context "on not supported class type encoding guessing" do
+    should "raise exception" do
+      assert_raise Exception do
+        Newscrapi::Encoding.guess_html_encoding(5)
+      end
+    end
+  end
+
+end
+
data/test/test_mapping.rb
ADDED
@@ -0,0 +1,58 @@
+
+require 'helper'
+require 'mocha'
+
+class TestMapping < Test::Unit::TestCase
+
+  context "on empty content mapping creation" do
+    setup do
+      @mapping = Newscrapi::Mapping.new
+      @mapping.instance_eval do
+        url_pattern /^http:\/\/www\.matchme\.com\//
+        content_at '//div[@id="failing_content"]'
+        content_at '//div[@id="itext_content"]'
+        content_at '//div[@id="itext_second_content"]'
+      end
+    end
+    should "match the right urls" do
+      assert @mapping.matches_url?('http://www.matchme.com/')
+    end
+    should "not match the wrong urls" do
+      assert !@mapping.matches_url?('https://www.somethingelse.org/hfas')
+    end
+    context "scrapping content for a specific site" do
+      setup do
+        pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
+        @document = Nokogiri::HTML(pretty_content)
+      end
+      should "extract the content" do
+        assert_match(%r{<p><strong>This is a strong text</strong></p>},
+          @mapping.scrap_content(@document))
+      end
+    end
+    context "on document with two content parts" do
+      setup do
+        two_content = File.open("#{File.dirname(__FILE__)}/test_pages/twocontent.html").read
+        @document = Nokogiri::HTML(two_content)
+      end
+      should "evaluate the contents in the order as they were added" do
+        assert_match(%r{The first one is matched}, @mapping.scrap_content(@document))
+      end
+    end
+  end
+
+  context "on url matcher definition using wildcards" do
+    setup do
+      @mapping = Newscrapi::Mapping.new
+      @mapping.instance_eval do
+        url_pattern 'http://*.example.com/*'
+      end
+    end
+    should "match urls with matching wildcards" do
+      assert @mapping.matches_url?('http://test.example.com/path/to/doc.html')
+    end
+    should "not match urls with not matching wildcards" do
+      assert !@mapping.matches_url?('http://test.example2.com/path/to/doc.html')
+    end
+  end
+end
data/test/test_pages.rb
ADDED
@@ -0,0 +1,69 @@
+
+require 'helper'
+require 'mocha'
+
+class TestPages < Test::Unit::TestCase
+
+  context "on page containing CDATA" do
+    setup do
+      @scrapper = Newscrapi::Scrapper.new
+      @scrapper.instance_eval do
+        content_mapping do
+          url_pattern /.*/
+          content_at '//div[@class="art-full adwords-text"]'
+        end
+        loofah_tags(:strip)
+      end
+      @scrapper.rescue_scrapping do |exception, url|
+        puts exception
+      end
+      cdata_content = File.open("#{File.dirname(__FILE__)}/test_pages/cdata.html").read
+      Kernel.expects(:open).returns(StringIO.new(cdata_content))
+    end
+    should "not escape the cdata entries, should leave cdata unvisible" do
+      assert_match /<!--</, @scrapper.scrap_content('http://www.cdata.url/hsdae')
+    end
+  end
+
+  context "on page encoding conversion" do
+    setup do
+      @scrapper = Newscrapi::Scrapper.new
+      @scrapper.instance_eval do
+        encode_to 'utf-8'
+        content_mapping do
+          url_pattern /.*/
+          content_at '//div[@id="itext_content"]'
+        end
+      end
+      content = File.open("#{File.dirname(__FILE__)}/test_pages/windows-1250_page.html").read
+      Kernel.expects(:open).returns(StringIO.new(content))
+    end
+    should "convert the document to utf-8 encoding" do
+      require 'rchardet'
+      scrapped_page = @scrapper.scrap_content('http://hop.kop')
+      assert_equal 'utf-8', CharDet.detect(scrapped_page)['encoding']
+    end
+  end
+
+=begin
+  context "on pattern for page for encoding" do
+    setup do
+      @scrapper = Newscrapi::Scrapper.new
+      @scrapper.instance_eval do
+        encode_to 'windows-1250'
+        content_mapping do
+          url_pattern /^http:\/\/www\.matchme\.com\//
+          content_at '//div[@class="node node-story"]/div[@class="content"]/p'
+          suppose_encoding 'utf-8'
+        end
+      end
+      CharDet.expects(:detect).never
+      @page = File.open("#{File.dirname(__FILE__)}/test_pages/page_without_encoding_meta_tag.html").read
+    end
+    should "scrap a page with converted content" do
+      assert_not_nil @scrapper.scrap_content('http://www.matchme.com', @page)
+    end
+  end
+=end
+end
+