web-page-parser 0.25 → 1.0.0

Files changed (41)
  1. checksums.yaml +7 -0
  2. checksums.yaml.gz.sig +1 -0
  3. data.tar.gz.sig +0 -0
  4. data/README.rdoc +5 -0
  5. data/lib/web-page-parser.rb +31 -0
  6. data/lib/web-page-parser/base_parser.rb +92 -42
  7. data/lib/web-page-parser/http.rb +63 -0
  8. data/lib/web-page-parser/parser_factory.rb +0 -1
  9. data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +72 -9
  10. data/lib/web-page-parser/parsers/guardian_page_parser.rb +51 -11
  11. data/lib/web-page-parser/parsers/independent_page_parser.rb +56 -0
  12. data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +108 -0
  13. data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +59 -0
  14. data/spec/base_parser_spec.rb +24 -8
  15. data/spec/fixtures/bbc_news/19957138.stm.html +1974 -0
  16. data/spec/fixtures/bbc_news/20230333.stm.html +2529 -0
  17. data/spec/fixtures/bbc_news/21528631.html +2021 -0
  18. data/spec/fixtures/bbc_news/8040164.stm.html +3095 -0
  19. data/spec/fixtures/cassette_library/BbcNewsPageParserV4.yml +1743 -0
  20. data/spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html +4713 -0
  21. data/spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html +4371 -0
  22. data/spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html +4150 -0
  23. data/spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html +4401 -0
  24. data/spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html +4454 -0
  25. data/spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html +4455 -0
  26. data/spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html +4368 -0
  27. data/spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html +919 -0
  28. data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html +328 -0
  29. data/spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html +1164 -0
  30. data/spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html +1 -0
  31. data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html +3664 -0
  32. data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html +3729 -0
  33. data/spec/parser_factory_spec.rb +3 -3
  34. data/spec/parsers/bbc_news_page_spec.rb +223 -3
  35. data/spec/parsers/guardian_page_spec.rb +157 -4
  36. data/spec/parsers/independent_page_parser_spec.rb +152 -0
  37. data/spec/parsers/new_york_times_page_parser_spec.rb +190 -0
  38. data/spec/parsers/washingtonpost_page_parser_spec.rb +114 -0
  39. data/spec/spec_helper.rb +5 -0
  40. metadata +167 -59
  41. metadata.gz.sig +2 -0
checksums.yaml
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: e1076ec5c2d36f32055c1d8996bd632ae9ec8c41
+  data.tar.gz: bd448302e1e04cf6d022f747959b0740367e75af
+SHA512:
+  metadata.gz: 31f642be9f27c32b59fd2cdf0e1fd19f17fb6f0f55f10d506cae97923bc72ca64b508aa296f2bce5345a6bbbec98d7b8ffd3f7fe92ae80fff914dad29e906c16
+  data.tar.gz: 330bfa9cf1e96e7c0cd98c51ca1f0d63de85fcfec11d074fdcac5ac538d86dd3e6f7237f39d28c24d32121fe04fd6331caf4c96d674b7f1923c5262b53b89574
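
checksums.yaml is the standard RubyGems checksum manifest added when a gem is built with a recent RubyGems. As a minimal sketch (not part of the gem; filenames assume an unpacked .gem archive), the digests above can be re-checked like this:

  require 'digest'
  require 'yaml'

  sums = YAML.load_file('checksums.yaml')
  # Compare recorded digests with freshly computed ones
  Digest::SHA512.hexdigest(File.binread('metadata.gz')) == sums['SHA512']['metadata.gz']  # => true
  Digest::SHA1.hexdigest(File.binread('data.tar.gz'))   == sums['SHA1']['data.tar.gz']    # => true
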
checksums.yaml.gz.sig
@@ -0,0 +1 @@
+ (binary signature data, not shown)

data.tar.gz.sig
Binary file
data/README.rdoc
@@ -31,6 +31,11 @@ they change.
   puts page.date # 2009-05-09T18:58:59+00:00
   puts page.content.first # The wife of author Ken Follett and ...
 
+== Ruby 1.8 support
+
+Installing the Oniguruma gem on Ruby 1.8 will make Web Page Parser run
+faster; it is highly recommended but not required.
+
 == More Info
 
 Web Page Parser was written by {John Leach}[http://johnleach.co.uk]
data/lib/web-page-parser.rb
@@ -1,4 +1,35 @@
 # $:.unshift File.join(File.dirname(__FILE__), 'web-page-parser')
 
+# Try using oniguruma on Ruby 1.8, if it's available
+if RUBY_VERSION =~ /^1.8/
+  begin
+    require 'oniguruma'
+  rescue LoadError
+  end
+end
+
+# News Sniffer was originally developed against oniguruma, so when it's
+# not available we just provide a compatible interface. This is a bit
+# silly, especially for Ruby 1.9 (where it's built in!), but it saves
+# changing lots of code.
+unless defined?(Oniguruma)
+  module Oniguruma
+    class ORegexp < Regexp
+
+      def self.new(r, options = "")
+        ropts = 0
+        ropts = ropts | Regexp::MULTILINE if options =~ /m/
+        ropts = ropts | Regexp::IGNORECASE if options =~ /i/
+        super(r, ropts)
+      end
+
+      def gsub(a, b)
+        a.gsub(self, b)
+      end
+    end
+  end
+end
+
+require 'web-page-parser/http.rb'
 require 'web-page-parser/base_parser.rb'
 require 'web-page-parser/parser_factory.rb'
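
For readers unfamiliar with Oniguruma's API, a quick sketch (not part of the gem; the pattern and strings are made up, and it assumes the oniguruma gem is absent so the fallback shim above is what gets defined) of what the shim provides: option strings map onto Regexp flags, and gsub takes the subject string first, mirroring Oniguruma::ORegexp.

  require 'web-page-parser'

  re = Oniguruma::ORegexp.new('<!--.*?-->', 'mi')  # 'm' => MULTILINE, 'i' => IGNORECASE
  re.gsub("before <!-- Comment\n --> after", '')   # => "before  after"
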
data/lib/web-page-parser/base_parser.rb
@@ -2,46 +2,115 @@
 module WebPageParser
   require 'digest'
   require 'date'
-  require 'oniguruma'
   require 'htmlentities'
-  require 'iconv'
 
-  # BaseParse is designed to be sub-classed to write new parsers. It
-  # provides some basic help but most of the work needs to be done by
-  # the sub-class.
+  class RetrieveError < StandardError ; end
+
+  class BaseParser
+
+    class << self
+      attr_accessor :retrieve_session
+    end
+
+    attr_reader :url, :guid
+
+    # takes a hash of options. The :url option passes the page url, and
+    # the :page option passes the raw html page content for parsing
+    def initialize(options = { })
+      @url = options[:url]
+      @page = options[:page]
+    end
+
+    # return the page contents, retrieving it from the server if necessary
+    def page
+      @page ||= retrieve_page
+    end
+
+    # request the page from the server and return the raw contents
+    def retrieve_page(rurl = nil)
+      durl = rurl || url
+      return nil unless durl
+      durl = filter_url(durl) if self.respond_to?(:filter_url)
+      self.class.retrieve_session ||= WebPageParser::HTTP::Session.new
+      self.class.retrieve_session.get(durl)
+    end
+
+    def title
+      @title
+    end
+
+    def content
+      @content || []
+    end
+
+    def date
+    end
+
+    # Return a hash representing the textual content of this web page
+    def hash
+      digest = Digest::MD5.new
+      digest << title.to_s
+      digest << content.join('').to_s
+      digest.to_s
+    end
+
+  end
+
+  # BaseRegexpParser is designed to be sub-classed to write new
+  # parsers that use regular expressions. It provides some basic help
+  # but most of the work needs to be done by the sub-class.
   #
   # Simple pages could be implemented by just defining new regular
   # expression constants, but more advanced parsing can be achieved
   # with the *_processor methods.
   #
-  class BaseParser
+  class BaseRegexpParser < BaseParser
     include Oniguruma
 
-    attr_reader :url, :guid, :page
-
-    ICONV = Iconv.new("utf8", "iso-8859-1")
 
     # The regular expression to extract the title
     TITLE_RE = //
-
+
     # The regular expression to extract the date
     DATE_RE = //
-
+
     # The regular expression to extract the content
     CONTENT_RE = //
-
+
     # The regular expression to find all characters that should be
     # removed from any content.
    KILL_CHARS_RE = ORegexp.new('[\n\r]+')
-
+
     # The object used to turn HTML entities into real charaters
     HTML_ENTITIES_DECODER = HTMLEntities.new
 
-    # takes a has of options. The :url option passes the page url, and
-    # the :page option passes the raw html page content for parsing
     def initialize(options = { })
-      @url = options[:url]
-      @page = options[:page]
+      super(options)
+      @page = encode(@page)
+    end
+
+    # Handle any string encoding
+    def encode(s)
+      return s if s.nil?
+      return s if s.valid_encoding?
+      if s.force_encoding("iso-8859-1").valid_encoding?
+        return s.encode('utf-8', 'iso-8859-1')
+      end
+      s
+    end
+
+    # return the page contents, retrieving it from the server if necessary
+    def page
+      @page ||= retrieve_page
+    end
+
+    # request the page from the server and return the raw contents
+    def retrieve_page(rurl = nil)
+      durl = rurl || url
+      return nil unless durl
+      durl = filter_url(durl) if self.respond_to?(:filter_url)
+      self.class.retrieve_session ||= WebPageParser::HTTP::Session.new
+      encode(self.class.retrieve_session.get(durl))
     end
 
     # The title method returns the title of the web page.
@@ -54,7 +123,6 @@ module WebPageParser
       if matches = class_const(:TITLE_RE).match(page)
         @title = matches[1].to_s.strip
         title_processor
-        @title = iconv(@title)
         @title = decode_entities(@title)
       end
     end
@@ -89,59 +157,41 @@ module WebPageParser
       matches = class_const(:CONTENT_RE).match(page)
       if matches
         @content = class_const(:KILL_CHARS_RE).gsub(matches[1].to_s, '')
-        @content = iconv(@content)
         content_processor
         @content.collect! { |p| decode_entities(p.strip) }
-        @content.delete_if { |p| p == '' or p.nil? }
+        @content.delete_if { |p| p == '' or p.nil? }
       end
       @content = [] if @content.nil?
       @content
     end
 
-    # Return a hash representing the textual content of this web page
-    def hash
-      digest = Digest::MD5.new
-      digest << title.to_s
-      digest << content.to_s
-      digest.to_s
-    end
-
     # Convert html entities to unicode
     def decode_entities(s)
       HTML_ENTITIES_DECODER.decode(s)
     end
-
+
     private
-
+
     # get the constant from this objects class
     def class_const(sym)
      self.class.const_get(sym)
     end
 
-    # Convert the encoding of the given text if necessary
-    def iconv(s)
-      if class_const(:ICONV) and ICONV
-        class_const(:ICONV).iconv(s)
-      else
-        s
-      end
-    end
-
     # Custom content parsing. It should split the @content up into an
     # array of paragraphs. Conversion to utf8 is done after this method.
     def content_processor
       @content = @content.split(/<p>/)
     end
-
+
     # Custom date parsing. It should parse @date into a DateTime object
     def date_processor
     end
-
+
     # Custom title parsing. It should clean up @title as
     # necessary. Conversion to utf8 is done after this method.
     def title_processor
     end
-
+
   end
 
 
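
To make the BaseParser / BaseRegexpParser split concrete, here is a hypothetical subclass (the class name, regexps and HTML are illustrative, not part of the gem): declare the capture-group constants and, as the real parsers do, refine a *_processor hook where needed.

  require 'web-page-parser'

  class ExamplePageParser < WebPageParser::BaseRegexpParser
    # group 1 of each regexp is what gets extracted
    TITLE_RE   = ORegexp.new('<h1 class="headline">(.*?)</h1>', 'i')
    CONTENT_RE = ORegexp.new('<div id="article-body">(.*?)</div>', 'm')

    private

    # tidy up after the default split on <p>
    def content_processor
      super
      @content.collect! { |p| p.gsub(/<[^>]+>/, '') }
    end
  end

  html   = '<h1 class="headline">Hello</h1><div id="article-body"><p>First</p><p>Second</p></div>'
  parser = ExamplePageParser.new(:page => html)  # pass :url instead to fetch via HTTP::Session
  parser.title    # => "Hello"
  parser.content  # => ["First", "Second"]
  parser.hash     # MD5 of title plus joined content, used to detect edits
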
data/lib/web-page-parser/http.rb
@@ -0,0 +1,63 @@
+module WebPageParser
+  module HTTP
+    require 'curb'
+    require 'zlib'
+
+    class Response < String
+      attr_accessor :curl
+
+      def initialize(s, curl)
+        self.curl = curl
+        super(s)
+      end
+    end
+
+    class Session
+
+      class CurlError < StandardError ; end
+
+      def curl
+        @curl ||= Curl::Easy.new do |c|
+          c.timeout = 8
+          c.connect_timeout = 8
+          c.dns_cache_timeout = 600
+          c.enable_cookies = true
+          c.follow_location = true
+          c.autoreferer = true
+          c.headers["User-Agent"] = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4'
+          c.headers["Accept-encoding"] = 'gzip, deflate'
+        end
+      end
+
+      def get(url)
+        curl.url = url
+        if curl.perform == false
+          raise CurlError, "curl.perform returned false"
+        end
+        uncompressed = gunzip(curl.body_str)
+        uncompressed = inflate(curl.body_str) if uncompressed.nil?
+        final_body = uncompressed || curl.body_str
+        if final_body.respond_to?(:force_encoding)
+          # Not sure if this is right. works for BBC/Guardian/New York Times anyway
+          final_body.force_encoding("utf-8")
+        end
+        Response.new(final_body, curl)
+      end
+
+      def inflate(s)
+        Zlib::Inflate.inflate(s)
+      rescue Zlib::DataError
+        nil
+      end
+
+      def gunzip(s)
+        s = StringIO.new(s)
+        Zlib::GzipReader.new(s).read
+      rescue Zlib::DataError
+      rescue Zlib::GzipFile::Error
+        nil
+      end
+    end
+  end
+
+end
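
A minimal usage sketch (the URL is made up and network access is assumed): Session#get follows redirects, transparently unpacks gzip/deflate bodies, and returns a Response, a String subclass that keeps the underlying Curl::Easy handle around.

  require 'web-page-parser'

  session = WebPageParser::HTTP::Session.new
  body    = session.get('http://www.bbc.co.uk/news/uk-12345678')
  body.length              # just a String holding the decompressed page
  body.curl.response_code  # the curb handle used for the request, e.g. 200
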
data/lib/web-page-parser/parser_factory.rb
@@ -1,5 +1,4 @@
 module WebPageParser
-  require 'oniguruma'
   class ParserFactory
     include Oniguruma
 
data/lib/web-page-parser/parsers/bbc_news_page_parser.rb
@@ -15,7 +15,7 @@ module WebPageParser
     end
 
     def self.create(options = {})
-      BbcNewsPageParserV4.new(options)
+      BbcNewsPageParserV5.new(options)
     end
   end
 
@@ -23,7 +23,7 @@ module WebPageParser
   # old News Sniffer BbcNewsPage class did. This should only ever
   # be used for backwards compatability with News Sniffer and is
   # never supplied for use by a factory.
-  class BbcNewsPageParserV1 < WebPageParser::BaseParser
+  class BbcNewsPageParserV1 < WebPageParser::BaseRegexpParser
 
     TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
     DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
@@ -34,11 +34,11 @@
 
     def hash
       # Old News Sniffer only hashed the content, not the title
-      Digest::MD5.hexdigest(content.to_s)
+      Digest::MD5.hexdigest(content.join('').to_s)
     end
 
     private
-
+
     def date_processor
       begin
         # OPD is in GMT/UTC, which DateTime seems to use by default
@@ -58,7 +58,7 @@
   end
 
   # BbcNewsPageParserV2 parses BBC News web pages
-  class BbcNewsPageParserV2 < WebPageParser::BaseParser
+  class BbcNewsPageParserV2 < WebPageParser::BaseRegexpParser
 
     TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
     DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
@@ -96,8 +96,8 @@
     CONTENT_RE = ORegexp.new('<div id="story\-body">(.*?)<div class="bookmark-list">', 'm')
     STRIP_FEATURES_RE = ORegexp.new('<div class="story-feature">(.*?)</div>', 'm')
     STRIP_MARKET_DATA_WIDGET_RE = ORegexp.new('<\!\-\- S MD_WIDGET.*? E MD_WIDGET \-\->')
-    ICONV = nil # BBC news is now in utf8
-
+    # BBC news is now in utf8
+
     def content_processor
       @content = STRIP_FEATURES_RE.gsub(@content, '')
       @content = STRIP_MARKET_DATA_WIDGET_RE.gsub(@content, '')
@@ -118,7 +118,8 @@
     STRIP_HYPERPUFF_RE = ORegexp.new('<div class=.embedded-hyper.+?<div class=.hyperpuff.+?</div>.+?</div>', 'm')
     STRIP_MARKETDATA_RE = ORegexp.new('<div class=.market\-data.+?</div>', 'm')
     STRIP_EMBEDDEDHYPER_RE = ORegexp.new('<div class=.embedded\-hyper.+?</div>', 'm')
-
+    STRIP_TWITTER_WIDGET_RE = ORegexp.new('<div[^>]+twitter\-module.*?</ul>','m')
+    STRIP_TWITTER_WIDGET2_RE = ORegexp.new('<ul[^>]+tweets.+?</ul>.+?<ul[^>]+links.+?</ul>', 'm')
     def content_processor
       @content = STRIP_PAGE_BOOKMARKS.gsub(@content, '')
       @content = STRIP_STORY_DATE.gsub(@content, '')
@@ -131,8 +132,70 @@
       @content = STRIP_HYPERPUFF_RE.gsub(@content, '')
       @content = STRIP_MARKETDATA_RE.gsub(@content, '')
       @content = STRIP_EMBEDDEDHYPER_RE.gsub(@content, '')
+      @content = STRIP_TWITTER_WIDGET_RE.gsub(@content, '')
+      @content = STRIP_TWITTER_WIDGET2_RE.gsub(@content, '')
       super
     end
   end
-
+
+
+  class BbcNewsPageParserV5 < WebPageParser::BaseParser
+    require 'nokogiri'
+
+    def html_doc
+      @html_document ||= Nokogiri::HTML(page)
+    end
+
+    def title
+      return @title if @title
+      @title = html_doc.css('h1.story-header').text.strip
+
+      # for older bbc articles
+      if @title.empty?
+        @title = html_doc.css('div#meta-information h1').text.strip
+      end
+
+      # for very old bbc articles
+      if @title.empty?
+        if headline_meta = html_doc.at_css('meta[name=Headline]')
+          @title = headline_meta['content'].to_s.strip
+        end
+      end
+
+      @title
+    end
+
+    def content
+      return @content if @content
+      @content = []
+      story_body = html_doc.css('div.story-body')
+
+      # for older bbc articles
+      if story_body.children.empty?
+        story_body = html_doc.css('div#story-body')
+      end
+
+      # for very old bbc articles
+      if story_body.children.empty?
+        story_body = html_doc.css('td.storybody')
+      end
+
+      story_body.children.each do |n|
+        @content << n.text.strip if n.name == 'p'
+        @content << n.text.strip if n.name == 'span' and n['class'].include? 'cross-head'
+      end
+      @content
+    end
+
+    def date
+      return @date if @date
+      if date_meta = html_doc.at_css('meta[name=OriginalPublicationDate]')
+        @date = DateTime.parse(date_meta['content']) rescue nil
+      end
+      @date
+    end
+
+  end
+
+
 end
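
An end-to-end sketch of the new Nokogiri-based parser introduced above (the article URL is made up): BaseParser fetches the page lazily through the shared HTTP::Session, and BbcNewsPageParserV5 extracts the fields from the parsed HTML.

  require 'web-page-parser'

  parser = WebPageParser::BbcNewsPageParserV5.new(:url => 'http://www.bbc.co.uk/news/uk-12345678')
  puts parser.title          # h1.story-header text, with fallbacks for older BBC markup
  puts parser.date           # DateTime from the OriginalPublicationDate meta tag, or nil
  puts parser.content.first  # first paragraph of div.story-body
  puts parser.hash           # MD5 over title + content, used to detect edits
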