web-page-parser 0.10 → 0.21

Sign up to get free protection for your applications and to get access to all the features.
@@ -120,7 +120,7 @@ module WebPageParser
120
120
 
121
121
  # Convert the encoding of the given text if necessary
122
122
  def iconv(s)
123
- if class_const(:ICONV)
123
+ if class_const(:ICONV) and ICONV
124
124
  class_const(:ICONV).iconv(s)
125
125
  else
126
126
  s
@@ -2,8 +2,8 @@
2
2
  module WebPageParser
3
3
 
4
4
  class BbcNewsPageParserFactory < WebPageParser::ParserFactory
5
- URL_RE = ORegexp.new("news\.bbc\.co\.uk/.*/[0-9]+\.stm")
6
- INVALID_URL_RE = ORegexp.new("in_pictures|pop_ups")
5
+ URL_RE = ORegexp.new("(www|news)\.bbc\.co\.uk/.+/([a-z]+-)?[0-9]+(\.stm)?$")
6
+ INVALID_URL_RE = ORegexp.new("in_pictures|pop_ups|sport1")
7
7
 
8
8
  def self.can_parse?(options)
9
9
  if INVALID_URL_RE.match(options[:url])
@@ -14,7 +14,7 @@ module WebPageParser
14
14
  end
15
15
 
16
16
  def self.create(options = {})
17
- BbcNewsPageParserV2.new(options)
17
+ BbcNewsPageParserV4.new(options)
18
18
  end
19
19
  end
20
20
 
@@ -65,9 +65,9 @@ module WebPageParser
65
65
  STRIP_BLOCKS_RE = ORegexp.new('<(table|noscript|script|object|form)[^>]*>.*?</\1>', 'i')
66
66
  STRIP_TAGS_RE = ORegexp.new('</?(b|div|img|tr|td|br|font|span)[^>]*>','i')
67
67
  STRIP_COMMENTS_RE = ORegexp.new('<!--.*?-->')
68
- STRIP_CAPTIONS_RE = ORegexp.new('<!-- caption .+<!-- END - caption -->')
68
+ STRIP_CAPTIONS_RE = ORegexp.new('<!-- caption .+?<!-- END - caption -->')
69
69
  WHITESPACE_RE = ORegexp.new('[\t ]+')
70
- PARA_RE = Regexp.new('</?p[^>]*>')
70
+ PARA_RE = Regexp.new('</?p[^>]*>', Regexp::IGNORECASE)
71
71
 
72
72
  private
73
73
 
@@ -90,4 +90,44 @@ module WebPageParser
90
90
  end
91
91
 
92
92
  end
93
+
94
+ class BbcNewsPageParserV3 < BbcNewsPageParserV2
95
+ CONTENT_RE = ORegexp.new('<div id="story\-body">(.*?)<div class="bookmark-list">', 'm')
96
+ STRIP_FEATURES_RE = ORegexp.new('<div class="story-feature">(.*?)</div>', 'm')
97
+ STRIP_MARKET_DATA_WIDGET_RE = ORegexp.new('<\!\-\- S MD_WIDGET.*? E MD_WIDGET \-\->')
98
+ ICONV = nil # BBC news is now in utf8
99
+
100
+ def content_processor
101
+ @content = STRIP_FEATURES_RE.gsub(@content, '')
102
+ @content = STRIP_MARKET_DATA_WIDGET_RE.gsub(@content, '')
103
+ super
104
+ end
105
+ end
106
+
107
+ class BbcNewsPageParserV4 < BbcNewsPageParserV3
108
+ CONTENT_RE = ORegexp.new('<div class=.story-body.>(.*?)<!-- / story\-body', 'm')
109
+ STRIP_PAGE_BOOKMARKS = ORegexp.new('<div id="page-bookmark-links-head".+?</div>', 'm')
110
+ STRIP_STORY_DATE = ORegexp.new('<span class="date".+?</span>', 'm')
111
+ STRIP_STORY_LASTUPDATED = ORegexp.new('<span class="time\-text".+?</span>', 'm')
112
+ STRIP_STORY_TIME = ORegexp.new('<span class="time".+?</span>', 'm')
113
+ TITLE_RE = ORegexp.new('<h1 class="story\-header">(.+?)</h1>', 'm')
114
+ STRIP_CAPTIONS_RE2 = ORegexp.new('<div class=.caption.+?</div>','m')
115
+ STRIP_HIDDEN_A = ORegexp.new('<a class=.hidden.+?</a>','m')
116
+ STRIP_STORY_FEATURE = ORegexp.new('<div class=.story\-feature.+?</div>', 'm')
117
+ STRIP_HYPERPUFF_RE = ORegexp.new('<div class=.embedded-hyper.+?<div class=.hyperpuff.+?</div>.+?</div>', 'm')
118
+
119
+ def content_processor
120
+ @content = STRIP_PAGE_BOOKMARKS.gsub(@content, '')
121
+ @content = STRIP_STORY_DATE.gsub(@content, '')
122
+ @content = STRIP_STORY_LASTUPDATED.gsub(@content, '')
123
+ @content = STRIP_STORY_TIME.gsub(@content, '')
124
+ @content = TITLE_RE.gsub(@content, '')
125
+ @content = STRIP_CAPTIONS_RE2.gsub(@content, '')
126
+ @content = STRIP_HIDDEN_A.gsub(@content, '')
127
+ @content = STRIP_STORY_FEATURE.gsub(@content, '')
128
+ @content = STRIP_HYPERPUFF_RE.gsub(@content, '')
129
+ super
130
+ end
131
+ end
132
+
93
133
  end