newscrapi 0.0.11

Files changed (59)
  1. data/.document +5 -0
  2. data/.gitignore +23 -0
  3. data/LICENSE +20 -0
  4. data/README.rdoc +17 -0
  5. data/Rakefile +56 -0
  6. data/VERSION +1 -0
  7. data/config/content_scrapper.rb +3 -0
  8. data/doc/classes/ContentMapping.html +242 -0
  9. data/doc/classes/ContentMapping.src/M000001.html +18 -0
  10. data/doc/classes/ContentMapping.src/M000002.html +18 -0
  11. data/doc/classes/ContentMapping.src/M000003.html +18 -0
  12. data/doc/classes/ContentMapping.src/M000004.html +19 -0
  13. data/doc/classes/ContentMapping.src/M000005.html +18 -0
  14. data/doc/classes/ContentMapping.src/M000006.html +25 -0
  15. data/doc/classes/ContentScrapper.html +297 -0
  16. data/doc/classes/ContentScrapper.src/M000007.html +18 -0
  17. data/doc/classes/ContentScrapper.src/M000008.html +18 -0
  18. data/doc/classes/ContentScrapper.src/M000009.html +20 -0
  19. data/doc/classes/ContentScrapper.src/M000010.html +20 -0
  20. data/doc/classes/ContentScrapper.src/M000011.html +18 -0
  21. data/doc/classes/ContentScrapper.src/M000012.html +21 -0
  22. data/doc/classes/ContentScrapper.src/M000013.html +21 -0
  23. data/doc/classes/ContentScrapper.src/M000014.html +33 -0
  24. data/doc/classes/ContentScrapper.src/M000015.html +18 -0
  25. data/doc/classes/ContentScrapper.src/M000016.html +18 -0
  26. data/doc/classes/Feedzirra.html +111 -0
  27. data/doc/classes/Feedzirra/FeedEntryUtilities.html +152 -0
  28. data/doc/classes/Feedzirra/FeedEntryUtilities.src/M000017.html +18 -0
  29. data/doc/classes/Feedzirra/FeedEntryUtilities.src/M000018.html +18 -0
  30. data/doc/created.rid +1 -0
  31. data/doc/files/lib/content_scrapper/content_mapping_rb.html +108 -0
  32. data/doc/files/lib/content_scrapper/feedzirra_rb.html +115 -0
  33. data/doc/files/lib/content_scrapper_rb.html +112 -0
  34. data/doc/fr_class_index.html +30 -0
  35. data/doc/fr_file_index.html +29 -0
  36. data/doc/fr_method_index.html +44 -0
  37. data/doc/index.html +24 -0
  38. data/doc/rdoc-style.css +208 -0
  39. data/lib/newscrapi.rb +2 -0
  40. data/lib/newscrapi/encoding.rb +44 -0
  41. data/lib/newscrapi/feedzirra.rb +17 -0
  42. data/lib/newscrapi/mapping.rb +54 -0
  43. data/lib/newscrapi/scrapper.rb +129 -0
  44. data/lib/newscrapi/testing.rb +19 -0
  45. data/rails/init.rb +3 -0
  46. data/test/helper.rb +9 -0
  47. data/test/test_encoding.rb +43 -0
  48. data/test/test_mapping.rb +58 -0
  49. data/test/test_pages.rb +69 -0
  50. data/test/test_pages/cdata.html +23 -0
  51. data/test/test_pages/page_without_encoding_meta_tag.html +401 -0
  52. data/test/test_pages/pretty.html +17 -0
  53. data/test/test_pages/pretty_missing_content.html +17 -0
  54. data/test/test_pages/twocontent.html +11 -0
  55. data/test/test_pages/ugly.html +399 -0
  56. data/test/test_pages/utf-8_page.html +405 -0
  57. data/test/test_pages/windows-1250_page.html +460 -0
  58. data/test/test_scrapper.rb +257 -0
  59. metadata +191 -0
data/lib/newscrapi.rb ADDED
@@ -0,0 +1,2 @@
+
+ require 'newscrapi/scrapper'
data/lib/newscrapi/encoding.rb ADDED
@@ -0,0 +1,44 @@
+
+ require 'rchardet'
+ require 'nokogiri'
+ require 'iconv'
+
+ module Newscrapi
+   module Encoding
+
+     def self.guess_html_encoding(obj)
+       doc, page = parse_parameters_doc_page(obj)
+
+       meta_encoding = doc.meta_encoding
+       return meta_encoding unless meta_encoding.nil?
+       CharDet.detect(page)['encoding']
+     end
+
+     def self.get_html_doc_with_changed_encoding(obj, encode_to)
+       doc, page = parse_parameters_doc_page(obj)
+
+       if encode_to
+         guessed_encoding = guess_html_encoding(page)
+         if guessed_encoding != encode_to
+           page = doc.serialize(:encoding => encode_to)
+           return Nokogiri::HTML(page)
+         end
+       end
+       doc
+     end
+
+     def self.parse_parameters_doc_page(obj)
+       if obj.class == String
+         page = obj
+         doc = Nokogiri::HTML(page)
+       elsif obj.class == Nokogiri::HTML::Document
+         doc = obj
+         page = doc.to_s
+       else
+         raise Exception.new("Unsupported type #{obj.class}")
+       end
+       return doc, page
+     end
+     private_class_method :parse_parameters_doc_page # plain `private` would not hide a singleton method
+   end
+ end
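
Taken together, Newscrapi::Encoding first trusts an explicit meta tag and only falls back to CharDet's statistical detection. A minimal usage sketch (the input file name is hypothetical):

    require 'newscrapi/encoding'

    html = File.read('article.html')  # hypothetical local copy of a page

    # Meta tag wins; otherwise CharDet guesses from the raw bytes
    puts Newscrapi::Encoding.guess_html_encoding(html)

    # Returns a Nokogiri document, re-serialized to UTF-8 only when needed
    doc = Newscrapi::Encoding.get_html_doc_with_changed_encoding(html, 'utf-8')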
data/lib/newscrapi/feedzirra.rb ADDED
@@ -0,0 +1,17 @@
+ # Feedzirra entries are extended with methods for scrapping content
+ require 'feedzirra/feed_entry_utilities'
+ require 'newscrapi/scrapper'
+
+ module Feedzirra
+   module FeedEntryUtilities
+     # Scrap the content based on the URL and the existing content and return it
+     def scrap_content(scrapper = Newscrapi::Scrapper.default, options = {})
+       scrapper.scrap_content(self.url, options) || self.content.to_s
+     end
+
+     # Scrap the content or fall back to the existing one, updating the feed entry in place
+     def scrap_content!(scrapper = Newscrapi::Scrapper.default, options = {})
+       self.content = scrap_content(scrapper, options)
+     end
+   end
+ end
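
Because these methods land on every feed entry, scrapping plugs straight into an ordinary Feedzirra fetch loop. A sketch, assuming a default scrapper has already been configured and with a hypothetical feed URL:

    require 'feedzirra'
    require 'newscrapi'
    require 'newscrapi/feedzirra'

    Newscrapi::Scrapper.create_new_default  # uses the default config file

    feed = Feedzirra::Feed.fetch_and_parse('http://example.com/feed.rss')
    feed.entries.each do |entry|
      entry.scrap_content!  # replaces entry.content with the scrapped full text, or keeps the old one
    end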
data/lib/newscrapi/mapping.rb ADDED
@@ -0,0 +1,54 @@
+ require 'newscrapi/scrapper'
+ require 'iconv'
+
+ class Newscrapi::Mapping
+
+   attr_reader :content_xpaths_list, :url_pattern_regexp
+
+   def initialize
+     @content_xpaths_list = []
+   end
+
+   def url_pattern(pattern)
+     @url_pattern_regexp = pattern.class == String ?
+       Regexp.compile("^#{Regexp.escape(pattern).gsub('\*','.*')}$") : pattern
+   end
+
+   def content_at(content_xpath)
+     @content_xpaths_list << content_xpath
+   end
+
+   # Declares the supposed source encoding and the target encoding in one call
+   def iconv(args)
+     suppose_encoding(args[:from])
+     convert_to(args[:to])
+   end
+
+   def suppose_encoding(encoding = nil)
+     return @supposed_encoding if encoding.nil?
+     @supposed_encoding = encoding
+   end
+
+   def convert_to(encoding = nil) # counterpart to suppose_encoding; was referenced but never defined
+     return @convert_to_encoding if encoding.nil?
+     @convert_to_encoding = encoding
+   end
+
+   def matches_url?(url)
+     url =~ @url_pattern_regexp
+   end
+
+   def scrap_content(obj, content_scrapper = nil)
+     doc = Newscrapi::Scrapper.parse_page(obj)
+     @content_xpaths_list.each do |content_xpath|
+       content_section = doc.xpath(content_xpath)
+       if content_section.count > 0
+         content = content_section.to_a.join("\n")
+         content = content_scrapper.clean_content(content) unless content_scrapper.nil?
+         return content
+       end
+     end
+     nil
+   end
+ end
+
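
On its own, a mapping is a URL matcher plus an ordered list of XPath candidates; the first XPath that matches anything wins. A small sketch with made-up pattern and XPaths:

    require 'newscrapi/mapping'

    mapping = Newscrapi::Mapping.new
    mapping.instance_eval do
      url_pattern 'http://*.example.com/*'    # '*' wildcards become '.*' in a regexp
      content_at '//div[@id="article_body"]'  # tried first
      content_at '//div[@class="story"]'      # fallback
    end

    mapping.matches_url?('http://news.example.com/a.html')       # => truthy
    mapping.scrap_content('<div id="article_body">Hello</div>')  # => the matched div as HTML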
data/lib/newscrapi/scrapper.rb ADDED
@@ -0,0 +1,129 @@
+
+ require 'open-uri'
+ require 'nokogiri'
+
+ require 'newscrapi/encoding'
+ require 'newscrapi/mapping'
+
+ module Newscrapi
+
+   class Scrapper
+
+     class << self
+       attr_accessor :default_config_file, :default
+
+       def create_new_default(*args)
+         self.default = self.new(*args)
+       end
+     end
+     # assigned through the writer; a bare assignment would only create a local variable
+     self.default_config_file = "#{File.dirname(__FILE__)}/../config/content_scrapper.rb"
+
+     def set_as_default
+       Newscrapi::Scrapper.default = self
+     end
+
+     attr_reader :content_mappings, :scrapping_exception_handler_block,
+                 :missing_url_matcher_handler_block, :missing_content_handler_block
+
+     def self.parse_page(obj)
+       return obj if obj.class == Nokogiri::HTML::Document
+       Nokogiri::HTML(obj)
+     end
+
+     def initialize(scrapper_config_file = nil)
+       @content_mappings = []
+       config_file = scrapper_config_file || Newscrapi::Scrapper.default_config_file
+       self.instance_eval(File.read(config_file), config_file) unless config_file.nil?
+     end
+
+     def encode_to(encoding = nil)
+       @encode_to = encoding unless encoding.nil?
+       @encode_to
+     end
+
+     def content_mapping(&block)
+       new_mapping = Newscrapi::Mapping.new
+       new_mapping.instance_eval(&block)
+       @content_mappings << new_mapping
+     end
+
+     def clean_content(content)
+       @content_cleaner_block.nil? ? content : @content_cleaner_block.call(content)
+     end
+
+     def sanitize_tags(&sanitize_settings)
+       @content_cleaner_block = lambda do |content|
+         require 'sanitize'
+         Sanitize.clean(content, sanitize_settings.call)
+       end
+     end
+
+     def loofah_tags(scrap_type)
+       @content_cleaner_block = lambda do |content|
+         require 'loofah'
+         Loofah.document(content).scrub!(scrap_type).to_s
+       end
+     end
+
+     def matching_content_mapper(url)
+       content_mappings.each { |content_mapping| return content_mapping if content_mapping.matches_url?(url) }
+       nil
+     end
+
+     def scrap_content(url, options = {})
+       if (content_mapping = matching_content_mapper(url)).nil?
+         @missing_url_matcher_handler_block.call(url) unless @missing_url_matcher_handler_block.nil?
+         return nil
+       end
+       return nil if content_mapping.content_xpaths_list.empty?
+       begin
+         use_page = ensure_encoding(options[:use_page] || Kernel.open(url).read)
+
+         doc = Newscrapi::Scrapper.parse_page(use_page)
+         scrapped_content = content_mapping.scrap_content(doc, self)
+
+         @missing_content_handler_block.call(url) if !@missing_content_handler_block.nil? and scrapped_content.nil?
+         return scrapped_content
+       rescue Exception
+         @scrapping_exception_handler_block.call($!, url) unless @scrapping_exception_handler_block.nil?
+         return nil
+       end
+     end
+
+     def rescue_scrapping(&block)
+       @scrapping_exception_handler_block = block
+     end
+
+     def missing_url_matcher(&block)
+       @missing_url_matcher_handler_block = block
+     end
+
+     def missing_content(&block)
+       @missing_content_handler_block = block
+     end
+
+     def report_to_stderr
+       rescue_scrapping do |exception, url|
+         STDERR << "error occurred while scrapping page #{url}\n"
+         STDERR << "#{exception.message}\n"
+         STDERR << exception.backtrace.join("\n")
+       end
+
+       missing_url_matcher do |url|
+         STDERR << "missing matcher for #{url}\n"
+       end
+
+       missing_content do |url|
+         STDERR << "empty content for #{url}\n"
+       end
+     end
+
+     private
+
+     def ensure_encoding(str)
+       Newscrapi::Encoding.get_html_doc_with_changed_encoding(str, self.encode_to)
+     end
+
+   end
+ end
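
Because initialize pushes the config file through instance_eval, a configuration is plain Ruby written against the DSL above. A sketch of what a config/content_scrapper.rb could contain (the URL pattern and XPath are hypothetical):

    # config/content_scrapper.rb -- evaluated in the scrapper instance's scope
    encode_to 'utf-8'

    content_mapping do
      url_pattern 'http://www.example.com/articles/*'
      content_at '//div[@id="content"]'
    end

    loofah_tags(:strip)  # lazily requires loofah when the first content is cleaned
    report_to_stderr

With that in place, scrapping a known URL is one call:

    scrapper = Newscrapi::Scrapper.new('config/content_scrapper.rb')
    scrapper.set_as_default
    text = scrapper.scrap_content('http://www.example.com/articles/1.html')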
data/lib/newscrapi/testing.rb ADDED
@@ -0,0 +1,19 @@
+ require 'newscrapi/scrapper'
+
+ class Newscrapi::Scrapper
+
+   alias :old_initialize :initialize
+
+   # forward any arguments so the original constructor signature keeps working
+   def initialize(*args)
+     old_initialize(*args)
+     testing_report_to_stderr
+   end
+
+   # in tests, re-raise scrapping errors instead of swallowing them
+   def testing_report_to_stderr
+     rescue_scrapping do |exception, url|
+       raise exception
+     end
+   end
+ end
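
With this patch loaded (as test/helper.rb does below), any exception that scrap_content would normally swallow resurfaces as a test error. A minimal sketch:

    require 'newscrapi/testing'

    scrapper = Newscrapi::Scrapper.new
    scrapper.scrap_content('http://bad.example/')  # network or parse errors now raise instead of returning nil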
data/rails/init.rb ADDED
@@ -0,0 +1,3 @@
+ require 'newscrapi'
+
+ Newscrapi::Scrapper.default_config_file = "#{RAILS_ROOT}/config/content_scrapper.rb"
data/test/helper.rb ADDED
@@ -0,0 +1,9 @@
+ require 'rubygems'
+ require 'test/unit'
+ require 'shoulda'
+
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
+
+ require 'newscrapi/scrapper'
+ require 'newscrapi/testing'
data/test/test_encoding.rb ADDED
@@ -0,0 +1,43 @@
+
+ require 'helper'
+ require 'nokogiri'
+ require 'newscrapi/encoding'
+
+ class TestEncoding < Test::Unit::TestCase
+
+   context "on guessing the encoding of a page with a metatag defined" do
+     setup do
+       @page = File.open("#{File.dirname(__FILE__)}/test_pages/windows-1250_page.html").read
+       @doc = Nokogiri::HTML(@page)
+     end
+     should "detect the page encoding correctly for string input" do
+       assert_equal 'windows-1250', Newscrapi::Encoding.guess_html_encoding(@page)
+     end
+     should "detect the page encoding correctly for parsed document input" do
+       assert_equal 'windows-1250', Newscrapi::Encoding.guess_html_encoding(@doc)
+     end
+   end
+
+   context "on guessing the encoding of a page without the encoding metatag defined" do
+     setup do
+       @page = File.open("#{File.dirname(__FILE__)}/test_pages/utf-8_page.html").read
+       @doc = Nokogiri::HTML(@page)
+     end
+     should "detect the page encoding correctly for string input" do
+       assert_equal 'utf-8', Newscrapi::Encoding.guess_html_encoding(@page)
+     end
+     should "detect the page encoding correctly for parsed document input" do
+       assert_equal 'utf-8', Newscrapi::Encoding.guess_html_encoding(@doc)
+     end
+   end
+
+   context "on guessing the encoding of an unsupported input type" do
+     should "raise an exception" do
+       assert_raise Exception do
+         Newscrapi::Encoding.guess_html_encoding(5)
+       end
+     end
+   end
+
+ end
+
data/test/test_mapping.rb ADDED
@@ -0,0 +1,58 @@
+
+ require 'helper'
+ require 'mocha'
+
+ class TestMapping < Test::Unit::TestCase
+
+   context "on empty content mapping creation" do
+     setup do
+       @mapping = Newscrapi::Mapping.new
+       @mapping.instance_eval do
+         url_pattern /^http:\/\/www\.matchme\.com\//
+         content_at '//div[@id="failing_content"]'
+         content_at '//div[@id="itext_content"]'
+         content_at '//div[@id="itext_second_content"]'
+       end
+     end
+     should "match the right urls" do
+       assert @mapping.matches_url?('http://www.matchme.com/')
+     end
+     should "not match the wrong urls" do
+       assert !@mapping.matches_url?('https://www.somethingelse.org/hfas')
+     end
+     context "scrapping content for a specific site" do
+       setup do
+         pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
+         @document = Nokogiri::HTML(pretty_content)
+       end
+       should "extract the content" do
+         assert_match(%r{<p><strong>This is a strong text</strong></p>},
+           @mapping.scrap_content(@document))
+       end
+     end
+     context "on document with two content parts" do
+       setup do
+         two_content = File.open("#{File.dirname(__FILE__)}/test_pages/twocontent.html").read
+         @document = Nokogiri::HTML(two_content)
+       end
+       should "evaluate the contents in the order they were added" do
+         assert_match(%r{The first one is matched}, @mapping.scrap_content(@document))
+       end
+     end
+   end
+
+   context "on url matcher definition using wildcards" do
+     setup do
+       @mapping = Newscrapi::Mapping.new
+       @mapping.instance_eval do
+         url_pattern 'http://*.example.com/*'
+       end
+     end
+     should "match urls with matching wildcards" do
+       assert @mapping.matches_url?('http://test.example.com/path/to/doc.html')
+     end
+     should "not match urls with not matching wildcards" do
+       assert !@mapping.matches_url?('http://test.example2.com/path/to/doc.html')
+     end
+   end
+ end
data/test/test_pages.rb ADDED
@@ -0,0 +1,69 @@
+
+ require 'helper'
+ require 'mocha'
+
+ class TestPages < Test::Unit::TestCase
+
+   context "on page containing CDATA" do
+     setup do
+       @scrapper = Newscrapi::Scrapper.new
+       @scrapper.instance_eval do
+         content_mapping do
+           url_pattern /.*/
+           content_at '//div[@class="art-full adwords-text"]'
+         end
+         loofah_tags(:strip)
+       end
+       @scrapper.rescue_scrapping do |exception, url|
+         puts exception
+       end
+       cdata_content = File.open("#{File.dirname(__FILE__)}/test_pages/cdata.html").read
+       Kernel.expects(:open).returns(StringIO.new(cdata_content))
+     end
+     should "not escape the cdata entries and should leave cdata invisible" do
+       assert_match(/<!--</, @scrapper.scrap_content('http://www.cdata.url/hsdae'))
+     end
+   end
+
+   context "on page encoding conversion" do
+     setup do
+       @scrapper = Newscrapi::Scrapper.new
+       @scrapper.instance_eval do
+         encode_to 'utf-8'
+         content_mapping do
+           url_pattern /.*/
+           content_at '//div[@id="itext_content"]'
+         end
+       end
+       content = File.open("#{File.dirname(__FILE__)}/test_pages/windows-1250_page.html").read
+       Kernel.expects(:open).returns(StringIO.new(content))
+     end
+     should "convert the document to utf-8 encoding" do
+       require 'rchardet'
+       scrapped_page = @scrapper.scrap_content('http://hop.kop')
+       assert_equal 'utf-8', CharDet.detect(scrapped_page)['encoding']
+     end
+   end
+
+ =begin
+   context "on pattern for page for encoding" do
+     setup do
+       @scrapper = Newscrapi::Scrapper.new
+       @scrapper.instance_eval do
+         encode_to 'windows-1250'
+         content_mapping do
+           url_pattern /^http:\/\/www\.matchme\.com\//
+           content_at '//div[@class="node node-story"]/div[@class="content"]/p'
+           suppose_encoding 'utf-8'
+         end
+       end
+       CharDet.expects(:detect).never
+       @page = File.open("#{File.dirname(__FILE__)}/test_pages/page_without_encoding_meta_tag.html").read
+     end
+     should "scrap a page with converted content" do
+       assert_not_nil @scrapper.scrap_content('http://www.matchme.com', :use_page => @page)
+     end
+   end
+ =end
+ end
+