web-page-parser 0.10 → 0.21

This diff shows the changes between publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -120,7 +120,7 @@ module WebPageParser
120
120
 
121
121
  # Convert the encoding of the given text if necessary
122
122
  def iconv(s)
123
- if class_const(:ICONV)
123
+ if class_const(:ICONV) and ICONV
124
124
  class_const(:ICONV).iconv(s)
125
125
  else
126
126
  s
@@ -2,8 +2,8 @@
2
2
  module WebPageParser
3
3
 
4
4
  class BbcNewsPageParserFactory < WebPageParser::ParserFactory
5
- URL_RE = ORegexp.new("news\.bbc\.co\.uk/.*/[0-9]+\.stm")
6
- INVALID_URL_RE = ORegexp.new("in_pictures|pop_ups")
5
+ URL_RE = ORegexp.new("(www|news)\.bbc\.co\.uk/.+/([a-z]+-)?[0-9]+(\.stm)?$")
6
+ INVALID_URL_RE = ORegexp.new("in_pictures|pop_ups|sport1")
7
7
 
8
8
  def self.can_parse?(options)
9
9
  if INVALID_URL_RE.match(options[:url])
@@ -14,7 +14,7 @@ module WebPageParser
14
14
  end
15
15
 
16
16
  def self.create(options = {})
17
- BbcNewsPageParserV2.new(options)
17
+ BbcNewsPageParserV4.new(options)
18
18
  end
19
19
  end
20
20
 
@@ -65,9 +65,9 @@ module WebPageParser
65
65
  STRIP_BLOCKS_RE = ORegexp.new('<(table|noscript|script|object|form)[^>]*>.*?</\1>', 'i')
66
66
  STRIP_TAGS_RE = ORegexp.new('</?(b|div|img|tr|td|br|font|span)[^>]*>','i')
67
67
  STRIP_COMMENTS_RE = ORegexp.new('<!--.*?-->')
68
- STRIP_CAPTIONS_RE = ORegexp.new('<!-- caption .+<!-- END - caption -->')
68
+ STRIP_CAPTIONS_RE = ORegexp.new('<!-- caption .+?<!-- END - caption -->')
69
69
  WHITESPACE_RE = ORegexp.new('[\t ]+')
70
- PARA_RE = Regexp.new('</?p[^>]*>')
70
+ PARA_RE = Regexp.new('</?p[^>]*>', Regexp::IGNORECASE)
71
71
 
72
72
  private
73
73
 
@@ -90,4 +90,44 @@ module WebPageParser
90
90
  end
91
91
 
92
92
  end
93
+
94
+ class BbcNewsPageParserV3 < BbcNewsPageParserV2
95
+ CONTENT_RE = ORegexp.new('<div id="story\-body">(.*?)<div class="bookmark-list">', 'm')
96
+ STRIP_FEATURES_RE = ORegexp.new('<div class="story-feature">(.*?)</div>', 'm')
97
+ STRIP_MARKET_DATA_WIDGET_RE = ORegexp.new('<\!\-\- S MD_WIDGET.*? E MD_WIDGET \-\->')
98
+ ICONV = nil # BBC news is now in utf8
99
+
100
+ def content_processor
101
+ @content = STRIP_FEATURES_RE.gsub(@content, '')
102
+ @content = STRIP_MARKET_DATA_WIDGET_RE.gsub(@content, '')
103
+ super
104
+ end
105
+ end
106
+
107
+ class BbcNewsPageParserV4 < BbcNewsPageParserV3
108
+ CONTENT_RE = ORegexp.new('<div class=.story-body.>(.*?)<!-- / story\-body', 'm')
109
+ STRIP_PAGE_BOOKMARKS = ORegexp.new('<div id="page-bookmark-links-head".+?</div>', 'm')
110
+ STRIP_STORY_DATE = ORegexp.new('<span class="date".+?</span>', 'm')
111
+ STRIP_STORY_LASTUPDATED = ORegexp.new('<span class="time\-text".+?</span>', 'm')
112
+ STRIP_STORY_TIME = ORegexp.new('<span class="time".+?</span>', 'm')
113
+ TITLE_RE = ORegexp.new('<h1 class="story\-header">(.+?)</h1>', 'm')
114
+ STRIP_CAPTIONS_RE2 = ORegexp.new('<div class=.caption.+?</div>','m')
115
+ STRIP_HIDDEN_A = ORegexp.new('<a class=.hidden.+?</a>','m')
116
+ STRIP_STORY_FEATURE = ORegexp.new('<div class=.story\-feature.+?</div>', 'm')
117
+ STRIP_HYPERPUFF_RE = ORegexp.new('<div class=.embedded-hyper.+?<div class=.hyperpuff.+?</div>.+?</div>', 'm')
118
+
119
+ def content_processor
120
+ @content = STRIP_PAGE_BOOKMARKS.gsub(@content, '')
121
+ @content = STRIP_STORY_DATE.gsub(@content, '')
122
+ @content = STRIP_STORY_LASTUPDATED.gsub(@content, '')
123
+ @content = STRIP_STORY_TIME.gsub(@content, '')
124
+ @content = TITLE_RE.gsub(@content, '')
125
+ @content = STRIP_CAPTIONS_RE2.gsub(@content, '')
126
+ @content = STRIP_HIDDEN_A.gsub(@content, '')
127
+ @content = STRIP_STORY_FEATURE.gsub(@content, '')
128
+ @content = STRIP_HYPERPUFF_RE.gsub(@content, '')
129
+ super
130
+ end
131
+ end
132
+
93
133
  end