web-page-parser 0.25 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +1 -0
- data.tar.gz.sig +0 -0
- data/README.rdoc +5 -0
- data/lib/web-page-parser.rb +31 -0
- data/lib/web-page-parser/base_parser.rb +92 -42
- data/lib/web-page-parser/http.rb +63 -0
- data/lib/web-page-parser/parser_factory.rb +0 -1
- data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +72 -9
- data/lib/web-page-parser/parsers/guardian_page_parser.rb +51 -11
- data/lib/web-page-parser/parsers/independent_page_parser.rb +56 -0
- data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +108 -0
- data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +59 -0
- data/spec/base_parser_spec.rb +24 -8
- data/spec/fixtures/bbc_news/19957138.stm.html +1974 -0
- data/spec/fixtures/bbc_news/20230333.stm.html +2529 -0
- data/spec/fixtures/bbc_news/21528631.html +2021 -0
- data/spec/fixtures/bbc_news/8040164.stm.html +3095 -0
- data/spec/fixtures/cassette_library/BbcNewsPageParserV4.yml +1743 -0
- data/spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html +4713 -0
- data/spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html +4371 -0
- data/spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html +4150 -0
- data/spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html +4401 -0
- data/spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html +4454 -0
- data/spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html +4455 -0
- data/spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html +4368 -0
- data/spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html +919 -0
- data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html +328 -0
- data/spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html +1164 -0
- data/spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html +1 -0
- data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html +3664 -0
- data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html +3729 -0
- data/spec/parser_factory_spec.rb +3 -3
- data/spec/parsers/bbc_news_page_spec.rb +223 -3
- data/spec/parsers/guardian_page_spec.rb +157 -4
- data/spec/parsers/independent_page_parser_spec.rb +152 -0
- data/spec/parsers/new_york_times_page_parser_spec.rb +190 -0
- data/spec/parsers/washingtonpost_page_parser_spec.rb +114 -0
- data/spec/spec_helper.rb +5 -0
- metadata +167 -59
- metadata.gz.sig +2 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: e1076ec5c2d36f32055c1d8996bd632ae9ec8c41
|
4
|
+
data.tar.gz: bd448302e1e04cf6d022f747959b0740367e75af
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 31f642be9f27c32b59fd2cdf0e1fd19f17fb6f0f55f10d506cae97923bc72ca64b508aa296f2bce5345a6bbbec98d7b8ffd3f7fe92ae80fff914dad29e906c16
|
7
|
+
data.tar.gz: 330bfa9cf1e96e7c0cd98c51ca1f0d63de85fcfec11d074fdcac5ac538d86dd3e6f7237f39d28c24d32121fe04fd6331caf4c96d674b7f1923c5262b53b89574
|
checksums.yaml.gz.sig
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
Q��ϟi{��/wW�i����U��k�[#j:�,5�%vm[�"{Eug�^��S8�z2���Tb���d����a ʲ����Ƈ��@�\��j�Hg4�#�l��M�F����RQ^܅��ŭ��Hu���%�QY��2^:T�U��S^3%&W��?�KA���1�0����o�z=8x/6/��v�|�"�vԬ�9.t`�rS�c�/�k ��z+�"o�����_�L��ʷ��B{d/�y�Ʋ�|����
|
data.tar.gz.sig
ADDED
Binary file
|
data/README.rdoc
CHANGED
@@ -31,6 +31,11 @@ they change.
|
|
31
31
|
puts page.date # 2009-05-09T18:58:59+00:00
|
32
32
|
puts page.content.first # The wife of author Ken Follett and ...
|
33
33
|
|
34
|
+
== Ruby 1.8 support
|
35
|
+
|
36
|
+
Installing the Oniguruma gem on Ruby 1.8 will make Web Page Parser run
|
37
|
+
faster. It's highly recommended but not required.
|
38
|
+
|
34
39
|
== More Info
|
35
40
|
|
36
41
|
Web Page Parser was written by {John Leach}[http://johnleach.co.uk]
|
data/lib/web-page-parser.rb
CHANGED
@@ -1,4 +1,35 @@
|
|
1
1
|
# $:.unshift File.join(File.dirname(__FILE__), 'web-page-parser')
|
2
2
|
|
3
|
+
# Try using oniguruma on Ruby 1.8, if it's available
|
4
|
+
if RUBY_VERSION =~ /^1.8/
|
5
|
+
begin
|
6
|
+
require 'oniguruma'
|
7
|
+
rescue LoadError
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
# News Sniffer was originally developed against oniguruma, so when it's
|
12
|
+
# not available we just provide a compatible interface. This is a bit
|
13
|
+
# silly, especially for Ruby 1.9 (where it's built in!), but it saves
|
14
|
+
# changing lots of code.
|
15
|
+
unless defined?(Oniguruma)
|
16
|
+
module Oniguruma
|
17
|
+
class ORegexp < Regexp
|
18
|
+
|
19
|
+
def self.new(r, options = "")
|
20
|
+
ropts = 0
|
21
|
+
ropts = ropts | Regexp::MULTILINE if options =~ /m/
|
22
|
+
ropts = ropts | Regexp::IGNORECASE if options =~ /i/
|
23
|
+
super(r, ropts)
|
24
|
+
end
|
25
|
+
|
26
|
+
def gsub(a, b)
|
27
|
+
a.gsub(self, b)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
require 'web-page-parser/http.rb'
|
3
34
|
require 'web-page-parser/base_parser.rb'
|
4
35
|
require 'web-page-parser/parser_factory.rb'
|
@@ -2,46 +2,115 @@
|
|
2
2
|
module WebPageParser
|
3
3
|
require 'digest'
|
4
4
|
require 'date'
|
5
|
-
require 'oniguruma'
|
6
5
|
require 'htmlentities'
|
7
|
-
require 'iconv'
|
8
6
|
|
9
|
-
|
10
|
-
|
11
|
-
|
7
|
+
class RetrieveError < StandardError ; end
|
8
|
+
|
9
|
+
class BaseParser
|
10
|
+
|
11
|
+
class << self
|
12
|
+
attr_accessor :retrieve_session
|
13
|
+
end
|
14
|
+
|
15
|
+
attr_reader :url, :guid
|
16
|
+
|
17
|
+
# takes a hash of options. The :url option passes the page url, and
|
18
|
+
# the :page option passes the raw html page content for parsing
|
19
|
+
def initialize(options = { })
|
20
|
+
@url = options[:url]
|
21
|
+
@page = options[:page]
|
22
|
+
end
|
23
|
+
|
24
|
+
# return the page contents, retrieving it from the server if necessary
|
25
|
+
def page
|
26
|
+
@page ||= retrieve_page
|
27
|
+
end
|
28
|
+
|
29
|
+
# request the page from the server and return the raw contents
|
30
|
+
def retrieve_page(rurl = nil)
|
31
|
+
durl = rurl || url
|
32
|
+
return nil unless durl
|
33
|
+
durl = filter_url(durl) if self.respond_to?(:filter_url)
|
34
|
+
self.class.retrieve_session ||= WebPageParser::HTTP::Session.new
|
35
|
+
self.class.retrieve_session.get(durl)
|
36
|
+
end
|
37
|
+
|
38
|
+
def title
|
39
|
+
@title
|
40
|
+
end
|
41
|
+
|
42
|
+
def content
|
43
|
+
@content || []
|
44
|
+
end
|
45
|
+
|
46
|
+
def date
|
47
|
+
end
|
48
|
+
|
49
|
+
# Return a hash representing the textual content of this web page
|
50
|
+
def hash
|
51
|
+
digest = Digest::MD5.new
|
52
|
+
digest << title.to_s
|
53
|
+
digest << content.join('').to_s
|
54
|
+
digest.to_s
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
58
|
+
|
59
|
+
# BaseRegexpParser is designed to be sub-classed to write new
|
60
|
+
# parsers that use regular expressions. It provides some basic help but most of
|
61
|
+
# the work needs to be done by the sub-class.
|
12
62
|
#
|
13
63
|
# Simple pages could be implemented by just defining new regular
|
14
64
|
# expression constants, but more advanced parsing can be achieved
|
15
65
|
# with the *_processor methods.
|
16
66
|
#
|
17
|
-
class BaseParser
|
67
|
+
class BaseRegexpParser < BaseParser
|
18
68
|
include Oniguruma
|
19
69
|
|
20
|
-
attr_reader :url, :guid, :page
|
21
|
-
|
22
|
-
ICONV = Iconv.new("utf8", "iso-8859-1")
|
23
70
|
|
24
71
|
# The regular expression to extract the title
|
25
72
|
TITLE_RE = //
|
26
|
-
|
73
|
+
|
27
74
|
# The regular expression to extract the date
|
28
75
|
DATE_RE = //
|
29
|
-
|
76
|
+
|
30
77
|
# The regular expression to extract the content
|
31
78
|
CONTENT_RE = //
|
32
|
-
|
79
|
+
|
33
80
|
# The regular expression to find all characters that should be
|
34
81
|
# removed from any content.
|
35
82
|
KILL_CHARS_RE = ORegexp.new('[\n\r]+')
|
36
|
-
|
83
|
+
|
37
84
|
# The object used to turn HTML entities into real characters
|
38
85
|
HTML_ENTITIES_DECODER = HTMLEntities.new
|
39
86
|
|
40
|
-
# takes a has of options. The :url option passes the page url, and
|
41
|
-
# the :page option passes the raw html page content for parsing
|
42
87
|
def initialize(options = { })
|
43
|
-
|
44
|
-
@page =
|
88
|
+
super(options)
|
89
|
+
@page = encode(@page)
|
90
|
+
end
|
91
|
+
|
92
|
+
# Handle any string encoding
|
93
|
+
def encode(s)
|
94
|
+
return s if s.nil?
|
95
|
+
return s if s.valid_encoding?
|
96
|
+
if s.force_encoding("iso-8859-1").valid_encoding?
|
97
|
+
return s.encode('utf-8', 'iso-8859-1')
|
98
|
+
end
|
99
|
+
s
|
100
|
+
end
|
101
|
+
|
102
|
+
# return the page contents, retrieving it from the server if necessary
|
103
|
+
def page
|
104
|
+
@page ||= retrieve_page
|
105
|
+
end
|
106
|
+
|
107
|
+
# request the page from the server and return the raw contents
|
108
|
+
def retrieve_page(rurl = nil)
|
109
|
+
durl = rurl || url
|
110
|
+
return nil unless durl
|
111
|
+
durl = filter_url(durl) if self.respond_to?(:filter_url)
|
112
|
+
self.class.retrieve_session ||= WebPageParser::HTTP::Session.new
|
113
|
+
encode(self.class.retrieve_session.get(durl))
|
45
114
|
end
|
46
115
|
|
47
116
|
# The title method returns the title of the web page.
|
@@ -54,7 +123,6 @@ module WebPageParser
|
|
54
123
|
if matches = class_const(:TITLE_RE).match(page)
|
55
124
|
@title = matches[1].to_s.strip
|
56
125
|
title_processor
|
57
|
-
@title = iconv(@title)
|
58
126
|
@title = decode_entities(@title)
|
59
127
|
end
|
60
128
|
end
|
@@ -89,59 +157,41 @@ module WebPageParser
|
|
89
157
|
matches = class_const(:CONTENT_RE).match(page)
|
90
158
|
if matches
|
91
159
|
@content = class_const(:KILL_CHARS_RE).gsub(matches[1].to_s, '')
|
92
|
-
@content = iconv(@content)
|
93
160
|
content_processor
|
94
161
|
@content.collect! { |p| decode_entities(p.strip) }
|
95
|
-
@content.delete_if { |p| p == '' or p.nil? }
|
162
|
+
@content.delete_if { |p| p == '' or p.nil? }
|
96
163
|
end
|
97
164
|
@content = [] if @content.nil?
|
98
165
|
@content
|
99
166
|
end
|
100
167
|
|
101
|
-
# Return a hash representing the textual content of this web page
|
102
|
-
def hash
|
103
|
-
digest = Digest::MD5.new
|
104
|
-
digest << title.to_s
|
105
|
-
digest << content.to_s
|
106
|
-
digest.to_s
|
107
|
-
end
|
108
|
-
|
109
168
|
# Convert html entities to unicode
|
110
169
|
def decode_entities(s)
|
111
170
|
HTML_ENTITIES_DECODER.decode(s)
|
112
171
|
end
|
113
|
-
|
172
|
+
|
114
173
|
private
|
115
|
-
|
174
|
+
|
116
175
|
# get the constant from this object's class
|
117
176
|
def class_const(sym)
|
118
177
|
self.class.const_get(sym)
|
119
178
|
end
|
120
179
|
|
121
|
-
# Convert the encoding of the given text if necessary
|
122
|
-
def iconv(s)
|
123
|
-
if class_const(:ICONV) and ICONV
|
124
|
-
class_const(:ICONV).iconv(s)
|
125
|
-
else
|
126
|
-
s
|
127
|
-
end
|
128
|
-
end
|
129
|
-
|
130
180
|
# Custom content parsing. It should split the @content up into an
|
131
181
|
# array of paragraphs. Conversion to utf8 is done after this method.
|
132
182
|
def content_processor
|
133
183
|
@content = @content.split(/<p>/)
|
134
184
|
end
|
135
|
-
|
185
|
+
|
136
186
|
# Custom date parsing. It should parse @date into a DateTime object
|
137
187
|
def date_processor
|
138
188
|
end
|
139
|
-
|
189
|
+
|
140
190
|
# Custom title parsing. It should clean up @title as
|
141
191
|
# necessary. Conversion to utf8 is done after this method.
|
142
192
|
def title_processor
|
143
193
|
end
|
144
|
-
|
194
|
+
|
145
195
|
end
|
146
196
|
|
147
197
|
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module WebPageParser
|
2
|
+
module HTTP
|
3
|
+
require 'curb'
|
4
|
+
require 'zlib'
|
5
|
+
|
6
|
+
class Response < String
|
7
|
+
attr_accessor :curl
|
8
|
+
|
9
|
+
def initialize(s, curl)
|
10
|
+
self.curl = curl
|
11
|
+
super(s)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
class Session
|
16
|
+
|
17
|
+
class CurlError < StandardError ; end
|
18
|
+
|
19
|
+
def curl
|
20
|
+
@curl ||= Curl::Easy.new do |c|
|
21
|
+
c.timeout = 8
|
22
|
+
c.connect_timeout = 8
|
23
|
+
c.dns_cache_timeout = 600
|
24
|
+
c.enable_cookies = true
|
25
|
+
c.follow_location = true
|
26
|
+
c.autoreferer = true
|
27
|
+
c.headers["User-Agent"] = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4'
|
28
|
+
c.headers["Accept-encoding"] = 'gzip, deflate'
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def get(url)
|
33
|
+
curl.url = url
|
34
|
+
if curl.perform == false
|
35
|
+
raise CurlError, "curl.perform returned false"
|
36
|
+
end
|
37
|
+
uncompressed = gunzip(curl.body_str)
|
38
|
+
uncompressed = inflate(curl.body_str) if uncompressed.nil?
|
39
|
+
final_body = uncompressed || curl.body_str
|
40
|
+
if final_body.respond_to?(:force_encoding)
|
41
|
+
# Not sure if this is right. Works for BBC/Guardian/New York Times anyway
|
42
|
+
final_body.force_encoding("utf-8")
|
43
|
+
end
|
44
|
+
Response.new(final_body, curl)
|
45
|
+
end
|
46
|
+
|
47
|
+
def inflate(s)
|
48
|
+
Zlib::Inflate.inflate(s)
|
49
|
+
rescue Zlib::DataError
|
50
|
+
nil
|
51
|
+
end
|
52
|
+
|
53
|
+
def gunzip(s)
|
54
|
+
s = StringIO.new(s)
|
55
|
+
Zlib::GzipReader.new(s).read
|
56
|
+
rescue Zlib::DataError
|
57
|
+
rescue Zlib::GzipFile::Error
|
58
|
+
nil
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
@@ -15,7 +15,7 @@ module WebPageParser
|
|
15
15
|
end
|
16
16
|
|
17
17
|
def self.create(options = {})
|
18
|
-
|
18
|
+
BbcNewsPageParserV5.new(options)
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
@@ -23,7 +23,7 @@ module WebPageParser
|
|
23
23
|
# old News Sniffer BbcNewsPage class did. This should only ever
|
24
24
|
# be used for backwards compatibility with News Sniffer and is
|
25
25
|
# never supplied for use by a factory.
|
26
|
-
class BbcNewsPageParserV1 < WebPageParser::
|
26
|
+
class BbcNewsPageParserV1 < WebPageParser::BaseRegexpParser
|
27
27
|
|
28
28
|
TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
|
29
29
|
DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
|
@@ -34,11 +34,11 @@ module WebPageParser
|
|
34
34
|
|
35
35
|
def hash
|
36
36
|
# Old News Sniffer only hashed the content, not the title
|
37
|
-
Digest::MD5.hexdigest(content.to_s)
|
37
|
+
Digest::MD5.hexdigest(content.join('').to_s)
|
38
38
|
end
|
39
39
|
|
40
40
|
private
|
41
|
-
|
41
|
+
|
42
42
|
def date_processor
|
43
43
|
begin
|
44
44
|
# OPD is in GMT/UTC, which DateTime seems to use by default
|
@@ -58,7 +58,7 @@ module WebPageParser
|
|
58
58
|
end
|
59
59
|
|
60
60
|
# BbcNewsPageParserV2 parses BBC News web pages
|
61
|
-
class BbcNewsPageParserV2 < WebPageParser::
|
61
|
+
class BbcNewsPageParserV2 < WebPageParser::BaseRegexpParser
|
62
62
|
|
63
63
|
TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
|
64
64
|
DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
|
@@ -96,8 +96,8 @@ module WebPageParser
|
|
96
96
|
CONTENT_RE = ORegexp.new('<div id="story\-body">(.*?)<div class="bookmark-list">', 'm')
|
97
97
|
STRIP_FEATURES_RE = ORegexp.new('<div class="story-feature">(.*?)</div>', 'm')
|
98
98
|
STRIP_MARKET_DATA_WIDGET_RE = ORegexp.new('<\!\-\- S MD_WIDGET.*? E MD_WIDGET \-\->')
|
99
|
-
|
100
|
-
|
99
|
+
# BBC news is now in utf8
|
100
|
+
|
101
101
|
def content_processor
|
102
102
|
@content = STRIP_FEATURES_RE.gsub(@content, '')
|
103
103
|
@content = STRIP_MARKET_DATA_WIDGET_RE.gsub(@content, '')
|
@@ -118,7 +118,8 @@ module WebPageParser
|
|
118
118
|
STRIP_HYPERPUFF_RE = ORegexp.new('<div class=.embedded-hyper.+?<div class=.hyperpuff.+?</div>.+?</div>', 'm')
|
119
119
|
STRIP_MARKETDATA_RE = ORegexp.new('<div class=.market\-data.+?</div>', 'm')
|
120
120
|
STRIP_EMBEDDEDHYPER_RE = ORegexp.new('<div class=.embedded\-hyper.+?</div>', 'm')
|
121
|
-
|
121
|
+
STRIP_TWITTER_WIDGET_RE = ORegexp.new('<div[^>]+twitter\-module.*?</ul>','m')
|
122
|
+
STRIP_TWITTER_WIDGET2_RE = ORegexp.new('<ul[^>]+tweets.+?</ul>.+?<ul[^>]+links.+?</ul>', 'm')
|
122
123
|
def content_processor
|
123
124
|
@content = STRIP_PAGE_BOOKMARKS.gsub(@content, '')
|
124
125
|
@content = STRIP_STORY_DATE.gsub(@content, '')
|
@@ -131,8 +132,70 @@ module WebPageParser
|
|
131
132
|
@content = STRIP_HYPERPUFF_RE.gsub(@content, '')
|
132
133
|
@content = STRIP_MARKETDATA_RE.gsub(@content, '')
|
133
134
|
@content = STRIP_EMBEDDEDHYPER_RE.gsub(@content, '')
|
135
|
+
@content = STRIP_TWITTER_WIDGET_RE.gsub(@content, '')
|
136
|
+
@content = STRIP_TWITTER_WIDGET2_RE.gsub(@content, '')
|
134
137
|
super
|
135
138
|
end
|
136
139
|
end
|
137
|
-
|
140
|
+
|
141
|
+
|
142
|
+
class BbcNewsPageParserV5 < WebPageParser::BaseParser
|
143
|
+
require 'nokogiri'
|
144
|
+
|
145
|
+
def html_doc
|
146
|
+
@html_document ||= Nokogiri::HTML(page)
|
147
|
+
end
|
148
|
+
|
149
|
+
def title
|
150
|
+
return @title if @title
|
151
|
+
@title = html_doc.css('h1.story-header').text.strip
|
152
|
+
|
153
|
+
# for older bbc articles
|
154
|
+
if @title.empty?
|
155
|
+
@title = html_doc.css('div#meta-information h1').text.strip
|
156
|
+
end
|
157
|
+
|
158
|
+
# for very old bbc articles
|
159
|
+
if @title.empty?
|
160
|
+
if headline_meta = html_doc.at_css('meta[name=Headline]')
|
161
|
+
@title = headline_meta['content'].to_s.strip
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
@title
|
166
|
+
end
|
167
|
+
|
168
|
+
def content
|
169
|
+
return @content if @content
|
170
|
+
@content = []
|
171
|
+
story_body = html_doc.css('div.story-body')
|
172
|
+
|
173
|
+
# for older bbc articles
|
174
|
+
if story_body.children.empty?
|
175
|
+
story_body = html_doc.css('div#story-body')
|
176
|
+
end
|
177
|
+
|
178
|
+
# for very old bbc articles
|
179
|
+
if story_body.children.empty?
|
180
|
+
story_body = html_doc.css('td.storybody')
|
181
|
+
end
|
182
|
+
|
183
|
+
story_body.children.each do |n|
|
184
|
+
@content << n.text.strip if n.name == 'p'
|
185
|
+
@content << n.text.strip if n.name == 'span' and n['class'].include? 'cross-head'
|
186
|
+
end
|
187
|
+
@content
|
188
|
+
end
|
189
|
+
|
190
|
+
def date
|
191
|
+
return @date if @date
|
192
|
+
if date_meta = html_doc.at_css('meta[name=OriginalPublicationDate]')
|
193
|
+
@date = DateTime.parse(date_meta['content']) rescue nil
|
194
|
+
end
|
195
|
+
@date
|
196
|
+
end
|
197
|
+
|
198
|
+
end
|
199
|
+
|
200
|
+
|
138
201
|
end
|