web-page-parser 0.25 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +1 -0
- data.tar.gz.sig +0 -0
- data/README.rdoc +5 -0
- data/lib/web-page-parser.rb +31 -0
- data/lib/web-page-parser/base_parser.rb +92 -42
- data/lib/web-page-parser/http.rb +63 -0
- data/lib/web-page-parser/parser_factory.rb +0 -1
- data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +72 -9
- data/lib/web-page-parser/parsers/guardian_page_parser.rb +51 -11
- data/lib/web-page-parser/parsers/independent_page_parser.rb +56 -0
- data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +108 -0
- data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +59 -0
- data/spec/base_parser_spec.rb +24 -8
- data/spec/fixtures/bbc_news/19957138.stm.html +1974 -0
- data/spec/fixtures/bbc_news/20230333.stm.html +2529 -0
- data/spec/fixtures/bbc_news/21528631.html +2021 -0
- data/spec/fixtures/bbc_news/8040164.stm.html +3095 -0
- data/spec/fixtures/cassette_library/BbcNewsPageParserV4.yml +1743 -0
- data/spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html +4713 -0
- data/spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html +4371 -0
- data/spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html +4150 -0
- data/spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html +4401 -0
- data/spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html +4454 -0
- data/spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html +4455 -0
- data/spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html +4368 -0
- data/spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html +919 -0
- data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html +328 -0
- data/spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html +1164 -0
- data/spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html +1 -0
- data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html +3664 -0
- data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html +3729 -0
- data/spec/parser_factory_spec.rb +3 -3
- data/spec/parsers/bbc_news_page_spec.rb +223 -3
- data/spec/parsers/guardian_page_spec.rb +157 -4
- data/spec/parsers/independent_page_parser_spec.rb +152 -0
- data/spec/parsers/new_york_times_page_parser_spec.rb +190 -0
- data/spec/parsers/washingtonpost_page_parser_spec.rb +114 -0
- data/spec/spec_helper.rb +5 -0
- metadata +167 -59
- metadata.gz.sig +2 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: e1076ec5c2d36f32055c1d8996bd632ae9ec8c41
+  data.tar.gz: bd448302e1e04cf6d022f747959b0740367e75af
+SHA512:
+  metadata.gz: 31f642be9f27c32b59fd2cdf0e1fd19f17fb6f0f55f10d506cae97923bc72ca64b508aa296f2bce5345a6bbbec98d7b8ffd3f7fe92ae80fff914dad29e906c16
+  data.tar.gz: 330bfa9cf1e96e7c0cd98c51ca1f0d63de85fcfec11d074fdcac5ac538d86dd3e6f7237f39d28c24d32121fe04fd6331caf4c96d674b7f1923c5262b53b89574

checksums.yaml.gz.sig
ADDED
@@ -0,0 +1 @@
+[binary signature data]

data.tar.gz.sig
ADDED
Binary file

data/README.rdoc
CHANGED
@@ -31,6 +31,11 @@ they change.
   puts page.date # 2009-05-09T18:58:59+00:00
   puts page.content.first # The wife of author Ken Follett and ...
 
+== Ruby 1.8 support
+
+Installing the Oniguruma gem on Ruby 1.8 will make Web Page Parser run
+faster; it's highly recommended but not required.
+
 == More Info
 
 Web Page Parser was written by {John Leach}[http://johnleach.co.uk]

data/lib/web-page-parser.rb
CHANGED
@@ -1,4 +1,35 @@
 # $:.unshift File.join(File.dirname(__FILE__), 'web-page-parser')
 
+# Try using oniguruma on Ruby 1.8, if it's available
+if RUBY_VERSION =~ /^1.8/
+  begin
+    require 'oniguruma'
+  rescue LoadError
+  end
+end
+
+# News Sniffer was originally developed against oniguruma, so when it's
+# not available we just provide a compatible interface. This is a bit
+# silly, especially for Ruby 1.9 (where it's built in!), but it saves
+# changing lots of code.
+unless defined?(Oniguruma)
+  module Oniguruma
+    class ORegexp < Regexp
+
+      def self.new(r, options = "")
+        ropts = 0
+        ropts = ropts | Regexp::MULTILINE if options =~ /m/
+        ropts = ropts | Regexp::IGNORECASE if options =~ /i/
+        super(r, ropts)
+      end
+
+      def gsub(a, b)
+        a.gsub(self, b)
+      end
+    end
+  end
+end
+
+require 'web-page-parser/http.rb'
 require 'web-page-parser/base_parser.rb'
 require 'web-page-parser/parser_factory.rb'

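For context, the shim above only has to cover what the parsers actually use: mapping the 'm' and 'i' option letters onto core Regexp flags, plus ORegexp#gsub, which takes its arguments in the opposite order to String#gsub. A minimal sketch of the fallback in action when the oniguruma gem is absent (the pattern and sample string are hypothetical):

  require 'web-page-parser'

  # Only defined by the shim when the real oniguruma gem is unavailable.
  re = Oniguruma::ORegexp.new('<b>(.+?)</b>', 'mi')
  re.options & Regexp::IGNORECASE   # => nonzero; the 'i' letter was mapped

  # ORegexp#gsub(string, replacement) delegates to String#gsub(self, replacement)
  re.gsub('<B>bold</B> text', '')   # => " text" (case-insensitive match removed)
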
data/lib/web-page-parser/base_parser.rb
CHANGED
@@ -2,46 +2,115 @@
 module WebPageParser
   require 'digest'
   require 'date'
-  require 'oniguruma'
   require 'htmlentities'
-  require 'iconv'
 
-
-
-
+  class RetrieveError < StandardError ; end
+
+  class BaseParser
+
+    class << self
+      attr_accessor :retrieve_session
+    end
+
+    attr_reader :url, :guid
+
+    # takes a hash of options. The :url option passes the page url, and
+    # the :page option passes the raw html page content for parsing
+    def initialize(options = { })
+      @url = options[:url]
+      @page = options[:page]
+    end
+
+    # return the page contents, retrieving it from the server if necessary
+    def page
+      @page ||= retrieve_page
+    end
+
+    # request the page from the server and return the raw contents
+    def retrieve_page(rurl = nil)
+      durl = rurl || url
+      return nil unless durl
+      durl = filter_url(durl) if self.respond_to?(:filter_url)
+      self.class.retrieve_session ||= WebPageParser::HTTP::Session.new
+      self.class.retrieve_session.get(durl)
+    end
+
+    def title
+      @title
+    end
+
+    def content
+      @content || []
+    end
+
+    def date
+    end
+
+    # Return a hash representing the textual content of this web page
+    def hash
+      digest = Digest::MD5.new
+      digest << title.to_s
+      digest << content.join('').to_s
+      digest.to_s
+    end
+
+  end
+
+  # BaseRegexpParser is designed to be sub-classed to write new
+  # parsers that use regular expressions. It provides some basic help
+  # but most of the work needs to be done by the sub-class.
   #
   # Simple pages could be implemented by just defining new regular
   # expression constants, but more advanced parsing can be achieved
   # with the *_processor methods.
   #
-  class BaseParser
+  class BaseRegexpParser < BaseParser
     include Oniguruma
 
-    attr_reader :url, :guid, :page
-
-    ICONV = Iconv.new("utf8", "iso-8859-1")
 
     # The regular expression to extract the title
     TITLE_RE = //
-
+
     # The regular expression to extract the date
     DATE_RE = //
-
+
     # The regular expression to extract the content
     CONTENT_RE = //
-
+
     # The regular expression to find all characters that should be
     # removed from any content.
     KILL_CHARS_RE = ORegexp.new('[\n\r]+')
-
+
     # The object used to turn HTML entities into real characters
     HTML_ENTITIES_DECODER = HTMLEntities.new
 
-    # takes a hash of options. The :url option passes the page url, and
-    # the :page option passes the raw html page content for parsing
     def initialize(options = { })
-
-      @page =
+      super(options)
+      @page = encode(@page)
+    end
+
+    # Handle any string encoding
+    def encode(s)
+      return s if s.nil?
+      return s if s.valid_encoding?
+      if s.force_encoding("iso-8859-1").valid_encoding?
+        return s.encode('utf-8', 'iso-8859-1')
+      end
+      s
+    end
+
+    # return the page contents, retrieving it from the server if necessary
+    def page
+      @page ||= retrieve_page
+    end
+
+    # request the page from the server and return the raw contents
+    def retrieve_page(rurl = nil)
+      durl = rurl || url
+      return nil unless durl
+      durl = filter_url(durl) if self.respond_to?(:filter_url)
+      self.class.retrieve_session ||= WebPageParser::HTTP::Session.new
+      encode(self.class.retrieve_session.get(durl))
     end
 
     # The title method returns the title of the web page.
@@ -54,7 +123,6 @@ module WebPageParser
       if matches = class_const(:TITLE_RE).match(page)
         @title = matches[1].to_s.strip
         title_processor
-        @title = iconv(@title)
         @title = decode_entities(@title)
       end
     end
@@ -89,59 +157,41 @@ module WebPageParser
       matches = class_const(:CONTENT_RE).match(page)
       if matches
         @content = class_const(:KILL_CHARS_RE).gsub(matches[1].to_s, '')
-        @content = iconv(@content)
         content_processor
         @content.collect! { |p| decode_entities(p.strip) }
-        @content.delete_if { |p| p == '' or p.nil? }
+        @content.delete_if { |p| p == '' or p.nil? }
       end
       @content = [] if @content.nil?
       @content
     end
 
-    # Return a hash representing the textual content of this web page
-    def hash
-      digest = Digest::MD5.new
-      digest << title.to_s
-      digest << content.to_s
-      digest.to_s
-    end
-
     # Convert html entities to unicode
     def decode_entities(s)
       HTML_ENTITIES_DECODER.decode(s)
    end
-
+
     private
-
+
     # get the constant from this objects class
     def class_const(sym)
       self.class.const_get(sym)
     end
 
-    # Convert the encoding of the given text if necessary
-    def iconv(s)
-      if class_const(:ICONV) and ICONV
-        class_const(:ICONV).iconv(s)
-      else
-        s
-      end
-    end
-
     # Custom content parsing. It should split the @content up into an
     # array of paragraphs. Conversion to utf8 is done after this method.
     def content_processor
       @content = @content.split(/<p>/)
     end
-
+
     # Custom date parsing. It should parse @date into a DateTime object
     def date_processor
     end
-
+
     # Custom title parsing. It should clean up @title as
     # necessary. Conversion to utf8 is done after this method.
     def title_processor
     end
-
+
   end
 
 

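To make the split concrete: BaseParser now owns option handling, lazy page retrieval through a shared HTTP session, and content hashing, while BaseRegexpParser layers the Oniguruma regexp machinery and encoding repair on top. A minimal sketch of the base class on its own (the URL and HTML are hypothetical):

  require 'web-page-parser'

  # Supplying :page means retrieve_page (and the shared session) is never used.
  p = WebPageParser::BaseParser.new(:url  => 'http://example.com/story.html',
                                    :page => '<p>raw html</p>')

  p.title    # => nil; extraction is left to subclasses
  p.content  # => []
  p.hash     # => MD5 hex digest of title.to_s plus content.join('')
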
data/lib/web-page-parser/http.rb
ADDED
@@ -0,0 +1,63 @@
+module WebPageParser
+  module HTTP
+    require 'curb'
+    require 'zlib'
+
+    class Response < String
+      attr_accessor :curl
+
+      def initialize(s, curl)
+        self.curl = curl
+        super(s)
+      end
+    end
+
+    class Session
+
+      class CurlError < StandardError ; end
+
+      def curl
+        @curl ||= Curl::Easy.new do |c|
+          c.timeout = 8
+          c.connect_timeout = 8
+          c.dns_cache_timeout = 600
+          c.enable_cookies = true
+          c.follow_location = true
+          c.autoreferer = true
+          c.headers["User-Agent"] = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4'
+          c.headers["Accept-encoding"] = 'gzip, deflate'
+        end
+      end
+
+      def get(url)
+        curl.url = url
+        if curl.perform == false
+          raise CurlError, "curl.perform returned false"
+        end
+        uncompressed = gunzip(curl.body_str)
+        uncompressed = inflate(curl.body_str) if uncompressed.nil?
+        final_body = uncompressed || curl.body_str
+        if final_body.respond_to?(:force_encoding)
+          # Not sure if this is right. works for BBC/Guardian/New York Times anyway
+          final_body.force_encoding("utf-8")
+        end
+        Response.new(final_body, curl)
+      end
+
+      def inflate(s)
+        Zlib::Inflate.inflate(s)
+      rescue Zlib::DataError
+        nil
+      end
+
+      def gunzip(s)
+        s = StringIO.new(s)
+        Zlib::GzipReader.new(s).read
+      rescue Zlib::DataError
+      rescue Zlib::GzipFile::Error
+        nil
+      end
+    end
+  end
+
+end

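The new transport keeps a single Curl::Easy handle per Session, follows redirects, and transparently unpacks gzip or deflate bodies before wrapping them in a Response. A minimal usage sketch (the URL is hypothetical):

  require 'web-page-parser'

  session = WebPageParser::HTTP::Session.new
  body = session.get('http://example.com/')

  body.length               # Response is a String, so String methods just work
  body.curl.response_code   # the underlying Curl::Easy handle is kept, e.g. 200
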
data/lib/web-page-parser/parsers/bbc_news_page_parser.rb
CHANGED
@@ -15,7 +15,7 @@ module WebPageParser
     end
 
     def self.create(options = {})
-
+      BbcNewsPageParserV5.new(options)
     end
   end
 
@@ -23,7 +23,7 @@ module WebPageParser
   # old News Sniffer BbcNewsPage class did. This should only ever
   # be used for backwards compatibility with News Sniffer and is
   # never supplied for use by a factory.
-  class BbcNewsPageParserV1 < WebPageParser::
+  class BbcNewsPageParserV1 < WebPageParser::BaseRegexpParser
 
     TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
     DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
@@ -34,11 +34,11 @@ module WebPageParser
 
     def hash
       # Old News Sniffer only hashed the content, not the title
-      Digest::MD5.hexdigest(content.to_s)
+      Digest::MD5.hexdigest(content.join('').to_s)
     end
 
     private
-
+
     def date_processor
       begin
         # OPD is in GMT/UTC, which DateTime seems to use by default
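The switch from content.to_s to content.join('') keeps the V1 digests stable across Ruby versions: on Ruby 1.8, Array#to_s concatenated the elements, but on 1.9+ it returns an inspect-style rendering, which would have silently changed every stored hash. A small illustration with hypothetical paragraphs:

  paras = ['First paragraph', 'Second paragraph']

  paras.join('')  # => "First paragraphSecond paragraph" on every Ruby version
  paras.to_s      # Ruby 1.8:  "First paragraphSecond paragraph"
                  # Ruby 1.9+: "[\"First paragraph\", \"Second paragraph\"]"

  require 'digest'
  Digest::MD5.hexdigest(paras.join(''))  # digest unchanged across upgrades
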
@@ -58,7 +58,7 @@ module WebPageParser
   end
 
   # BbcNewsPageParserV2 parses BBC News web pages
-  class BbcNewsPageParserV2 < WebPageParser::
+  class BbcNewsPageParserV2 < WebPageParser::BaseRegexpParser
 
     TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
     DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
@@ -96,8 +96,8 @@ module WebPageParser
     CONTENT_RE = ORegexp.new('<div id="story\-body">(.*?)<div class="bookmark-list">', 'm')
     STRIP_FEATURES_RE = ORegexp.new('<div class="story-feature">(.*?)</div>', 'm')
     STRIP_MARKET_DATA_WIDGET_RE = ORegexp.new('<\!\-\- S MD_WIDGET.*? E MD_WIDGET \-\->')
-
-
+    # BBC news is now in utf8
+
     def content_processor
       @content = STRIP_FEATURES_RE.gsub(@content, '')
       @content = STRIP_MARKET_DATA_WIDGET_RE.gsub(@content, '')
@@ -118,7 +118,8 @@ module WebPageParser
     STRIP_HYPERPUFF_RE = ORegexp.new('<div class=.embedded-hyper.+?<div class=.hyperpuff.+?</div>.+?</div>', 'm')
     STRIP_MARKETDATA_RE = ORegexp.new('<div class=.market\-data.+?</div>', 'm')
     STRIP_EMBEDDEDHYPER_RE = ORegexp.new('<div class=.embedded\-hyper.+?</div>', 'm')
-
+    STRIP_TWITTER_WIDGET_RE = ORegexp.new('<div[^>]+twitter\-module.*?</ul>','m')
+    STRIP_TWITTER_WIDGET2_RE = ORegexp.new('<ul[^>]+tweets.+?</ul>.+?<ul[^>]+links.+?</ul>', 'm')
     def content_processor
       @content = STRIP_PAGE_BOOKMARKS.gsub(@content, '')
       @content = STRIP_STORY_DATE.gsub(@content, '')
@@ -131,8 +132,70 @@ module WebPageParser
       @content = STRIP_HYPERPUFF_RE.gsub(@content, '')
       @content = STRIP_MARKETDATA_RE.gsub(@content, '')
       @content = STRIP_EMBEDDEDHYPER_RE.gsub(@content, '')
+      @content = STRIP_TWITTER_WIDGET_RE.gsub(@content, '')
+      @content = STRIP_TWITTER_WIDGET2_RE.gsub(@content, '')
       super
     end
   end
-
+
+
+  class BbcNewsPageParserV5 < WebPageParser::BaseParser
+    require 'nokogiri'
+
+    def html_doc
+      @html_document ||= Nokogiri::HTML(page)
+    end
+
+    def title
+      return @title if @title
+      @title = html_doc.css('h1.story-header').text.strip
+
+      # for older bbc articles
+      if @title.empty?
+        @title = html_doc.css('div#meta-information h1').text.strip
+      end
+
+      # for very old bbc articles
+      if @title.empty?
+        if headline_meta = html_doc.at_css('meta[name=Headline]')
+          @title = headline_meta['content'].to_s.strip
+        end
+      end
+
+      @title
+    end
+
+    def content
+      return @content if @content
+      @content = []
+      story_body = html_doc.css('div.story-body')
+
+      # for older bbc articles
+      if story_body.children.empty?
+        story_body = html_doc.css('div#story-body')
+      end
+
+      # for very old bbc articles
+      if story_body.children.empty?
+        story_body = html_doc.css('td.storybody')
+      end
+
+      story_body.children.each do |n|
+        @content << n.text.strip if n.name == 'p'
+        @content << n.text.strip if n.name == 'span' and n['class'].include? 'cross-head'
+      end
+      @content
+    end
+
+    def date
+      return @date if @date
+      if date_meta = html_doc.at_css('meta[name=OriginalPublicationDate]')
+        @date = DateTime.parse(date_meta['content']) rescue nil
+      end
+      @date
+    end
+
+  end
+
+
 end
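BbcNewsPageParserV5 is the first parser built on the plain BaseParser plus Nokogiri rather than regular expressions, trying progressively older BBC layouts (h1.story-header, then div#meta-information, then the Headline meta tag) until one matches. A minimal sketch of using it directly (the HTML fragment is hypothetical; the specs drive it with the fixture pages listed above):

  require 'web-page-parser'

  html = '<html><body>' +
         '<h1 class="story-header">Example headline</h1>' +
         '<div class="story-body"><p>First para.</p><p>Second para.</p></div>' +
         '</body></html>'

  page = WebPageParser::BbcNewsPageParserV5.new(:page => html)
  page.title    # => "Example headline"
  page.content  # => ["First para.", "Second para."]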