web-page-parser 0.25 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +7 -0
  2. checksums.yaml.gz.sig +1 -0
  3. data.tar.gz.sig +0 -0
  4. data/README.rdoc +5 -0
  5. data/lib/web-page-parser.rb +31 -0
  6. data/lib/web-page-parser/base_parser.rb +92 -42
  7. data/lib/web-page-parser/http.rb +63 -0
  8. data/lib/web-page-parser/parser_factory.rb +0 -1
  9. data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +72 -9
  10. data/lib/web-page-parser/parsers/guardian_page_parser.rb +51 -11
  11. data/lib/web-page-parser/parsers/independent_page_parser.rb +56 -0
  12. data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +108 -0
  13. data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +59 -0
  14. data/spec/base_parser_spec.rb +24 -8
  15. data/spec/fixtures/bbc_news/19957138.stm.html +1974 -0
  16. data/spec/fixtures/bbc_news/20230333.stm.html +2529 -0
  17. data/spec/fixtures/bbc_news/21528631.html +2021 -0
  18. data/spec/fixtures/bbc_news/8040164.stm.html +3095 -0
  19. data/spec/fixtures/cassette_library/BbcNewsPageParserV4.yml +1743 -0
  20. data/spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html +4713 -0
  21. data/spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html +4371 -0
  22. data/spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html +4150 -0
  23. data/spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html +4401 -0
  24. data/spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html +4454 -0
  25. data/spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html +4455 -0
  26. data/spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html +4368 -0
  27. data/spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html +919 -0
  28. data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html +328 -0
  29. data/spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html +1164 -0
  30. data/spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html +1 -0
  31. data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html +3664 -0
  32. data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html +3729 -0
  33. data/spec/parser_factory_spec.rb +3 -3
  34. data/spec/parsers/bbc_news_page_spec.rb +223 -3
  35. data/spec/parsers/guardian_page_spec.rb +157 -4
  36. data/spec/parsers/independent_page_parser_spec.rb +152 -0
  37. data/spec/parsers/new_york_times_page_parser_spec.rb +190 -0
  38. data/spec/parsers/washingtonpost_page_parser_spec.rb +114 -0
  39. data/spec/spec_helper.rb +5 -0
  40. metadata +167 -59
  41. metadata.gz.sig +2 -0
checksums.yaml
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+ metadata.gz: e1076ec5c2d36f32055c1d8996bd632ae9ec8c41
+ data.tar.gz: bd448302e1e04cf6d022f747959b0740367e75af
+ SHA512:
+ metadata.gz: 31f642be9f27c32b59fd2cdf0e1fd19f17fb6f0f55f10d506cae97923bc72ca64b508aa296f2bce5345a6bbbec98d7b8ffd3f7fe92ae80fff914dad29e906c16
+ data.tar.gz: 330bfa9cf1e96e7c0cd98c51ca1f0d63de85fcfec11d074fdcac5ac538d86dd3e6f7237f39d28c24d32121fe04fd6331caf4c96d674b7f1923c5262b53b89574
checksums.yaml.gz.sig
@@ -0,0 +1 @@
+ (binary signature data)

data.tar.gz.sig
Binary file
data/README.rdoc
@@ -31,6 +31,11 @@ they change.
  puts page.date # 2009-05-09T18:58:59+00:00
  puts page.content.first # The wife of author Ken Follett and ...

+ == Ruby 1.8 support
+
+ Installing the Oniguruma gem on Ruby 1.8 will make Web Page Parser run
+ faster, it's highly recommended but not required.
+
  == More Info

  Web Page Parser was written by {John Leach}[http://johnleach.co.uk]
data/lib/web-page-parser.rb
@@ -1,4 +1,35 @@
  # $:.unshift File.join(File.dirname(__FILE__), 'web-page-parser')

+ # Try using oniguruma on Ruby 1.8, if it's available
+ if RUBY_VERSION =~ /^1.8/
+ begin
+ require 'oniguruma'
+ rescue LoadError
+ end
+ end
+
+ # New Sniffer was originally developed against oniguruma, so when it's
+ # not available we just provide a compatible interface. This is a bit
+ # silly, especially for Ruby 1.9 (where it's built in!), but it saves
+ # changing lots of code.
+ unless defined?(Oniguruma)
+ module Oniguruma
+ class ORegexp < Regexp
+
+ def self.new(r, options = "")
+ ropts = 0
+ ropts = ropts | Regexp::MULTILINE if options =~ /m/
+ ropts = ropts | Regexp::IGNORECASE if options =~ /i/
+ super(r, ropts)
+ end
+
+ def gsub(a, b)
+ a.gsub(self, b)
+ end
+ end
+ end
+ end
+
+ require 'web-page-parser/http.rb'
  require 'web-page-parser/base_parser.rb'
  require 'web-page-parser/parser_factory.rb'
data/lib/web-page-parser/base_parser.rb
@@ -2,46 +2,115 @@
  module WebPageParser
  require 'digest'
  require 'date'
- require 'oniguruma'
  require 'htmlentities'
- require 'iconv'

- # BaseParse is designed to be sub-classed to write new parsers. It
- # provides some basic help but most of the work needs to be done by
- # the sub-class.
+ class RetrieveError < StandardError ; end
+
+ class BaseParser
+
+ class << self
+ attr_accessor :retrieve_session
+ end
+
+ attr_reader :url, :guid
+
+ # takes a hash of options. The :url option passes the page url, and
+ # the :page option passes the raw html page content for parsing
+ def initialize(options = { })
+ @url = options[:url]
+ @page = options[:page]
+ end
+
+ # return the page contents, retrieving it from the server if necessary
+ def page
+ @page ||= retrieve_page
+ end
+
+ # request the page from the server and return the raw contents
+ def retrieve_page(rurl = nil)
+ durl = rurl || url
+ return nil unless durl
+ durl = filter_url(durl) if self.respond_to?(:filter_url)
+ self.class.retrieve_session ||= WebPageParser::HTTP::Session.new
+ self.class.retrieve_session.get(durl)
+ end
+
+ def title
+ @title
+ end
+
+ def content
+ @content || []
+ end
+
+ def date
+ end
+
+ # Return a hash representing the textual content of this web page
+ def hash
+ digest = Digest::MD5.new
+ digest << title.to_s
+ digest << content.join('').to_s
+ digest.to_s
+ end
+
+ end
+
+ # BaseRegexpParser is designed to be sub-classed to write new
+ # parsers that use regular. It provides some basic help but most of
+ # the work needs to be done by the sub-class.
  #
  # Simple pages could be implemented by just defining new regular
  # expression constants, but more advanced parsing can be achieved
  # with the *_processor methods.
  #
- class BaseParser
+ class BaseRegexpParser < BaseParser
  include Oniguruma

- attr_reader :url, :guid, :page
-
- ICONV = Iconv.new("utf8", "iso-8859-1")

  # The regular expression to extract the title
  TITLE_RE = //
-
+
  # The regular expression to extract the date
  DATE_RE = //
-
+
  # The regular expression to extract the content
  CONTENT_RE = //
-
+
  # The regular expression to find all characters that should be
  # removed from any content.
  KILL_CHARS_RE = ORegexp.new('[\n\r]+')
-
+
  # The object used to turn HTML entities into real charaters
  HTML_ENTITIES_DECODER = HTMLEntities.new

- # takes a has of options. The :url option passes the page url, and
- # the :page option passes the raw html page content for parsing
  def initialize(options = { })
- @url = options[:url]
- @page = options[:page]
+ super(options)
+ @page = encode(@page)
+ end
+
+ # Handle any string encoding
+ def encode(s)
+ return s if s.nil?
+ return s if s.valid_encoding?
+ if s.force_encoding("iso-8859-1").valid_encoding?
+ return s.encode('utf-8', 'iso-8859-1')
+ end
+ s
+ end
+
+ # return the page contents, retrieving it from the server if necessary
+ def page
+ @page ||= retrieve_page
+ end
+
+ # request the page from the server and return the raw contents
+ def retrieve_page(rurl = nil)
+ durl = rurl || url
+ return nil unless durl
+ durl = filter_url(durl) if self.respond_to?(:filter_url)
+ self.class.retrieve_session ||= WebPageParser::HTTP::Session.new
+ encode(self.class.retrieve_session.get(durl))
  end

  # The title method returns the title of the web page.
@@ -54,7 +123,6 @@ module WebPageParser
  if matches = class_const(:TITLE_RE).match(page)
  @title = matches[1].to_s.strip
  title_processor
- @title = iconv(@title)
  @title = decode_entities(@title)
  end
  end
@@ -89,59 +157,41 @@ module WebPageParser
  matches = class_const(:CONTENT_RE).match(page)
  if matches
  @content = class_const(:KILL_CHARS_RE).gsub(matches[1].to_s, '')
- @content = iconv(@content)
  content_processor
  @content.collect! { |p| decode_entities(p.strip) }
- @content.delete_if { |p| p == '' or p.nil? }
+ @content.delete_if { |p| p == '' or p.nil? }
  end
  @content = [] if @content.nil?
  @content
  end

- # Return a hash representing the textual content of this web page
- def hash
- digest = Digest::MD5.new
- digest << title.to_s
- digest << content.to_s
- digest.to_s
- end
-
  # Convert html entities to unicode
  def decode_entities(s)
  HTML_ENTITIES_DECODER.decode(s)
  end
-
+
  private
-
+
  # get the constant from this objects class
  def class_const(sym)
  self.class.const_get(sym)
  end

- # Convert the encoding of the given text if necessary
- def iconv(s)
- if class_const(:ICONV) and ICONV
- class_const(:ICONV).iconv(s)
- else
- s
- end
- end
-
  # Custom content parsing. It should split the @content up into an
  # array of paragraphs. Conversion to utf8 is done after this method.
  def content_processor
  @content = @content.split(/<p>/)
  end
-
+
  # Custom date parsing. It should parse @date into a DateTime object
  def date_processor
  end
-
+
  # Custom title parsing. It should clean up @title as
  # necessary. Conversion to utf8 is done after this method.
  def title_processor
  end
-
+
  end

data/lib/web-page-parser/http.rb
@@ -0,0 +1,63 @@
+ module WebPageParser
+ module HTTP
+ require 'curb'
+ require 'zlib'
+
+ class Response < String
+ attr_accessor :curl
+
+ def initialize(s, curl)
+ self.curl = curl
+ super(s)
+ end
+ end
+
+ class Session
+
+ class CurlError < StandardError ; end
+
+ def curl
+ @curl ||= Curl::Easy.new do |c|
+ c.timeout = 8
+ c.connect_timeout = 8
+ c.dns_cache_timeout = 600
+ c.enable_cookies = true
+ c.follow_location = true
+ c.autoreferer = true
+ c.headers["User-Agent"] = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4'
+ c.headers["Accept-encoding"] = 'gzip, deflate'
+ end
+ end
+
+ def get(url)
+ curl.url = url
+ if curl.perform == false
+ raise CurlError, "curl.perform returned false"
+ end
+ uncompressed = gunzip(curl.body_str)
+ uncompressed = inflate(curl.body_str) if uncompressed.nil?
+ final_body = uncompressed || curl.body_str
+ if final_body.respond_to?(:force_encoding)
+ # Not sure if this is right. works for BBC/Guardian/New York Times anyway
+ final_body.force_encoding("utf-8")
+ end
+ Response.new(final_body, curl)
+ end
+
+ def inflate(s)
+ Zlib::Inflate.inflate(s)
+ rescue Zlib::DataError
+ nil
+ end
+
+ def gunzip(s)
+ s = StringIO.new(s)
+ Zlib::GzipReader.new(s).read
+ rescue Zlib::DataError
+ rescue Zlib::GzipFile::Error
+ nil
+ end
+ end
+ end
+
+ end
data/lib/web-page-parser/parser_factory.rb
@@ -1,5 +1,4 @@
  module WebPageParser
- require 'oniguruma'
  class ParserFactory
  include Oniguruma

data/lib/web-page-parser/parsers/bbc_news_page_parser.rb
@@ -15,7 +15,7 @@ module WebPageParser
  end

  def self.create(options = {})
- BbcNewsPageParserV4.new(options)
+ BbcNewsPageParserV5.new(options)
  end
  end

@@ -23,7 +23,7 @@ module WebPageParser
  # old News Sniffer BbcNewsPage class did. This should only ever
  # be used for backwards compatability with News Sniffer and is
  # never supplied for use by a factory.
- class BbcNewsPageParserV1 < WebPageParser::BaseParser
+ class BbcNewsPageParserV1 < WebPageParser::BaseRegexpParser

  TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
  DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
@@ -34,11 +34,11 @@ module WebPageParser

  def hash
  # Old News Sniffer only hashed the content, not the title
- Digest::MD5.hexdigest(content.to_s)
+ Digest::MD5.hexdigest(content.join('').to_s)
  end

  private
-
+
  def date_processor
  begin
  # OPD is in GMT/UTC, which DateTime seems to use by default
@@ -58,7 +58,7 @@ module WebPageParser
  end

  # BbcNewsPageParserV2 parses BBC News web pages
- class BbcNewsPageParserV2 < WebPageParser::BaseParser
+ class BbcNewsPageParserV2 < WebPageParser::BaseRegexpParser

  TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
  DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
@@ -96,8 +96,8 @@ module WebPageParser
  CONTENT_RE = ORegexp.new('<div id="story\-body">(.*?)<div class="bookmark-list">', 'm')
  STRIP_FEATURES_RE = ORegexp.new('<div class="story-feature">(.*?)</div>', 'm')
  STRIP_MARKET_DATA_WIDGET_RE = ORegexp.new('<\!\-\- S MD_WIDGET.*? E MD_WIDGET \-\->')
- ICONV = nil # BBC news is now in utf8
-
+ # BBC news is now in utf8
+
  def content_processor
  @content = STRIP_FEATURES_RE.gsub(@content, '')
  @content = STRIP_MARKET_DATA_WIDGET_RE.gsub(@content, '')
@@ -118,7 +118,8 @@ module WebPageParser
  STRIP_HYPERPUFF_RE = ORegexp.new('<div class=.embedded-hyper.+?<div class=.hyperpuff.+?</div>.+?</div>', 'm')
  STRIP_MARKETDATA_RE = ORegexp.new('<div class=.market\-data.+?</div>', 'm')
  STRIP_EMBEDDEDHYPER_RE = ORegexp.new('<div class=.embedded\-hyper.+?</div>', 'm')
-
+ STRIP_TWITTER_WIDGET_RE = ORegexp.new('<div[^>]+twitter\-module.*?</ul>','m')
+ STRIP_TWITTER_WIDGET2_RE = ORegexp.new('<ul[^>]+tweets.+?</ul>.+?<ul[^>]+links.+?</ul>', 'm')
  def content_processor
  @content = STRIP_PAGE_BOOKMARKS.gsub(@content, '')
  @content = STRIP_STORY_DATE.gsub(@content, '')
@@ -131,8 +132,70 @@ module WebPageParser
  @content = STRIP_HYPERPUFF_RE.gsub(@content, '')
  @content = STRIP_MARKETDATA_RE.gsub(@content, '')
  @content = STRIP_EMBEDDEDHYPER_RE.gsub(@content, '')
+ @content = STRIP_TWITTER_WIDGET_RE.gsub(@content, '')
+ @content = STRIP_TWITTER_WIDGET2_RE.gsub(@content, '')
  super
  end
  end
-
+
+
+ class BbcNewsPageParserV5 < WebPageParser::BaseParser
+ require 'nokogiri'
+
+ def html_doc
+ @html_document ||= Nokogiri::HTML(page)
+ end
+
+ def title
+ return @title if @title
+ @title = html_doc.css('h1.story-header').text.strip
+
+ # for older bbc articles
+ if @title.empty?
+ @title = html_doc.css('div#meta-information h1').text.strip
+ end
+
+ # for very old bbc articles
+ if @title.empty?
+ if headline_meta = html_doc.at_css('meta[name=Headline]')
+ @title = headline_meta['content'].to_s.strip
+ end
+ end
+
+ @title
+ end
+
+ def content
+ return @content if @content
+ @content = []
+ story_body = html_doc.css('div.story-body')
+
+ # for older bbc articles
+ if story_body.children.empty?
+ story_body = html_doc.css('div#story-body')
+ end
+
+ # for very old bbc articles
+ if story_body.children.empty?
+ story_body = html_doc.css('td.storybody')
+ end
+
+ story_body.children.each do |n|
+ @content << n.text.strip if n.name == 'p'
+ @content << n.text.strip if n.name == 'span' and n['class'].include? 'cross-head'
+ end
+ @content
+ end
+
+ def date
+ return @date if @date
+ if date_meta = html_doc.at_css('meta[name=OriginalPublicationDate]')
+ @date = DateTime.parse(date_meta['content']) rescue nil
+ end
+ @date
+ end
+
+ end
+
+
  end
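
A minimal usage sketch of the 1.0.0 API introduced above (the option names, accessors and parser classes come straight from the hunks; the file name and URL are placeholders, and the gem plus its dependencies such as nokogiri, curb and htmlentities are assumed to be installed):

  require 'web-page-parser'

  # Parse an already-fetched BBC article body; :page and :url are the documented options
  parser = WebPageParser::BbcNewsPageParserV5.new(:page => File.read("article.html"))

  puts parser.title         # headline, extracted with Nokogiri CSS selectors
  puts parser.content.first # first paragraph of the story body
  puts parser.date          # DateTime parsed from the OriginalPublicationDate meta tag
  puts parser.hash          # MD5 digest of title plus content

  # Or let the factory pick the current parser version and fetch the page over HTTP
  parser = WebPageParser::BbcNewsPageParserFactory.create(:url => "http://news.bbc.co.uk/...")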