web-page-parser 0.10 → 0.21
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/web-page-parser/base_parser.rb +1 -1
- data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +45 -5
- data/spec/fixtures/bbc_news/10249066.stm.html +1361 -0
- data/spec/fixtures/bbc_news/10341015.stm.html +1278 -0
- data/spec/fixtures/bbc_news/11125504.html +1481 -0
- data/spec/parsers/bbc_news_page_spec.rb +65 -3
- metadata +58 -23
@@ -2,8 +2,8 @@
|
|
2
2
|
module WebPageParser
|
3
3
|
|
4
4
|
class BbcNewsPageParserFactory < WebPageParser::ParserFactory
|
5
|
-
URL_RE = ORegexp.new("news\.bbc\.co\.uk
|
6
|
-
INVALID_URL_RE = ORegexp.new("in_pictures|pop_ups")
|
5
|
+
URL_RE = ORegexp.new("(www|news)\.bbc\.co\.uk/.+/([a-z]+-)?[0-9]+(\.stm)?$")
|
6
|
+
INVALID_URL_RE = ORegexp.new("in_pictures|pop_ups|sport1")
|
7
7
|
|
8
8
|
def self.can_parse?(options)
|
9
9
|
if INVALID_URL_RE.match(options[:url])
|
@@ -14,7 +14,7 @@ module WebPageParser
|
|
14
14
|
end
|
15
15
|
|
16
16
|
# Factory entry point: builds the parser object for a BBC News page.
# `options` defaults to an empty hash and is passed through unchanged.
# After this change the factory instantiates BbcNewsPageParserV4.
# NOTE(review): the text of the removed ("-") line is missing from this
# capture, so the class that was instantiated before is not visible here.
def self.create(options = {})
|
17
|
-
|
17
|
+
BbcNewsPageParserV4.new(options)
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
@@ -65,9 +65,9 @@ module WebPageParser
|
|
65
65
|
STRIP_BLOCKS_RE = ORegexp.new('<(table|noscript|script|object|form)[^>]*>.*?</\1>', 'i')
|
66
66
|
STRIP_TAGS_RE = ORegexp.new('</?(b|div|img|tr|td|br|font|span)[^>]*>','i')
|
67
67
|
STRIP_COMMENTS_RE = ORegexp.new('<!--.*?-->')
|
68
|
-
STRIP_CAPTIONS_RE = ORegexp.new('<!-- caption
|
68
|
+
STRIP_CAPTIONS_RE = ORegexp.new('<!-- caption .+?<!-- END - caption -->')
|
69
69
|
WHITESPACE_RE = ORegexp.new('[\t ]+')
|
70
|
-
PARA_RE = Regexp.new('</?p[^>]*>')
|
70
|
+
PARA_RE = Regexp.new('</?p[^>]*>', Regexp::IGNORECASE)
|
71
71
|
|
72
72
|
private
|
73
73
|
|
@@ -90,4 +90,44 @@ module WebPageParser
|
|
90
90
|
end
|
91
91
|
|
92
92
|
end
|
93
|
+
|
94
|
+
# Parser variant for BBC News pages whose article body lives in
# <div id="story-body">. Subclasses BbcNewsPageParserV2, overriding the
# content pattern and the ICONV setting, and adding two clean-up patterns
# that content_processor applies before calling the inherited processing.
class BbcNewsPageParserV3 < BbcNewsPageParserV2
|
95
|
+
# Captures (non-greedily) everything between the story-body div opening tag
# and the following <div class="bookmark-list">.
# NOTE(review): the 'm' option presumably lets "." span newlines - confirm
# against the ORegexp documentation.
CONTENT_RE = ORegexp.new('<div id="story\-body">(.*?)<div class="bookmark-list">', 'm')
|
96
|
+
# Matches a story-feature div up to the FIRST following </div>; a feature box
# containing nested divs would therefore only be matched up to its first
# inner closing tag.
STRIP_FEATURES_RE = ORegexp.new('<div class="story-feature">(.*?)</div>', 'm')
|
97
|
+
# Matches from the "<!-- S MD_WIDGET" marker comment to the closing
# "E MD_WIDGET -->" marker. No 'm' option is passed here, unlike the two
# patterns above.
# NOTE(review): confirm the widget markup never spans multiple lines.
STRIP_MARKET_DATA_WIDGET_RE = ORegexp.new('<\!\-\- S MD_WIDGET.*? E MD_WIDGET \-\->')
|
98
|
+
# NOTE(review): presumably disables a charset conversion step performed by a
# parent class - the code that reads ICONV is not visible in this diff.
ICONV = nil # BBC news is now in utf8
|
99
|
+
|
100
|
+
# Removes story-feature blocks and market-data widget markup from @content,
# then hands over to the parent class's content_processor via super.
def content_processor
|
101
|
+
@content = STRIP_FEATURES_RE.gsub(@content, '')
|
102
|
+
@content = STRIP_MARKET_DATA_WIDGET_RE.gsub(@content, '')
|
103
|
+
super
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
# Parser variant for BBC News pages whose article body is wrapped in a
# <div class=...story-body...> and terminated by a "<!-- / story-body"
# comment. Subclasses BbcNewsPageParserV3; content_processor strips further
# page furniture (bookmarks, dates, headline, captions, hidden links,
# feature boxes, hyperpuffs) before calling up the inheritance chain.
# Several patterns use "." where a quote character would appear, so they
# match whichever single character surrounds the attribute value.
class BbcNewsPageParserV4 < BbcNewsPageParserV3
|
108
|
+
# Captures (non-greedily) everything between the story-body div opening tag
# and the "<!-- / story-body" marker comment.
CONTENT_RE = ORegexp.new('<div class=.story-body.>(.*?)<!-- / story\-body', 'm')
|
109
|
+
# Matches the page-bookmark-links-head div up to the first following </div>.
STRIP_PAGE_BOOKMARKS = ORegexp.new('<div id="page-bookmark-links-head".+?</div>', 'm')
|
110
|
+
# Matches a <span class="date"...> element through its first closing </span>.
STRIP_STORY_DATE = ORegexp.new('<span class="date".+?</span>', 'm')
|
111
|
+
# Matches a <span class="time-text"...> element through its first </span>.
STRIP_STORY_LASTUPDATED = ORegexp.new('<span class="time\-text".+?</span>', 'm')
|
112
|
+
# Matches a <span class="time"...> element through its first </span>.
STRIP_STORY_TIME = ORegexp.new('<span class="time".+?</span>', 'm')
|
113
|
+
# Captures the text of the <h1 class="story-header"> headline. In this class
# it is also used by content_processor to remove the headline from @content.
# NOTE(review): title extraction itself presumably happens in a parent
# class - not visible in this diff.
TITLE_RE = ORegexp.new('<h1 class="story\-header">(.+?)</h1>', 'm')
|
114
|
+
# Matches a div whose class attribute starts with "caption", up to the first
# following </div>.
STRIP_CAPTIONS_RE2 = ORegexp.new('<div class=.caption.+?</div>','m')
|
115
|
+
# Matches an anchor whose class attribute starts with "hidden", through </a>.
STRIP_HIDDEN_A = ORegexp.new('<a class=.hidden.+?</a>','m')
|
116
|
+
# Matches a div whose class attribute starts with "story-feature", up to the
# first following </div>.
STRIP_STORY_FEATURE = ORegexp.new('<div class=.story\-feature.+?</div>', 'm')
|
117
|
+
# Matches an "embedded-hyper" div that contains a "hyperpuff" div, consuming
# two closing </div> tags so both levels are covered.
STRIP_HYPERPUFF_RE = ORegexp.new('<div class=.embedded-hyper.+?<div class=.hyperpuff.+?</div>.+?</div>', 'm')
|
118
|
+
|
119
|
+
# Applies each strip pattern to @content in turn, replacing matches with an
# empty string, then calls the BbcNewsPageParserV3 content_processor via
# super.
def content_processor
|
120
|
+
@content = STRIP_PAGE_BOOKMARKS.gsub(@content, '')
|
121
|
+
@content = STRIP_STORY_DATE.gsub(@content, '')
|
122
|
+
@content = STRIP_STORY_LASTUPDATED.gsub(@content, '')
|
123
|
+
@content = STRIP_STORY_TIME.gsub(@content, '')
|
124
|
+
# Drop the headline so it is not repeated inside the article body.
@content = TITLE_RE.gsub(@content, '')
|
125
|
+
@content = STRIP_CAPTIONS_RE2.gsub(@content, '')
|
126
|
+
@content = STRIP_HIDDEN_A.gsub(@content, '')
|
127
|
+
@content = STRIP_STORY_FEATURE.gsub(@content, '')
|
128
|
+
@content = STRIP_HYPERPUFF_RE.gsub(@content, '')
|
129
|
+
super
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
93
133
|
end
|