href-preview 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a67ef243d53323e6ece890b63a587c0b6a298db3
4
+ data.tar.gz: 4843646e829511658a361df0fe24a81fea30d512
5
+ SHA512:
6
+ metadata.gz: bd3ea4180c61cab25a689adecc200cded13d3747bcefa1ab6061cbd13dc014e77be8fd2b3cca8cedc17ca4d15a5d9b998cb9a2f83d1e4abc6950de98fe3b9389
7
+ data.tar.gz: 88b207d8bcc1b6a5574e0987bd717257d1b90c98b48277f09d53ae282cfcbd17f8c34db9d783e35322569048901546387e4179d46e447d545e97a40b839163e1
data/CHANGELOG ADDED
@@ -0,0 +1,3 @@
1
+ == 0.1.0
2
+
3
+ * initial release
data/Gemfile ADDED
@@ -0,0 +1,15 @@
1
source 'https://rubygems.org'

# dependencies go here

# Gems needed both for running the specs and for development work.
group :test, :development do
  gem 'rake', '>= 0.9.0'
  gem 'rspec', '>= 2.11.0'
  # Only installed on MRI 1.8 (see the :platform restriction).
  gem 'rcov', '>= 0.9.9', :platform => :mri_18
end

# Gems that are only needed on a development machine.
group :development do
  gem 'launchy', '>= 2.1.1'
  gem 'yard'
  gem 'kramdown'
end
data/LICENSE ADDED
@@ -0,0 +1,202 @@
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright [yyyy] [name of copyright owner]
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
data/README.md ADDED
@@ -0,0 +1,39 @@
1
+ # HREF Preview
2
+
3
+ <dl>
4
+ <dt>Homepage</dt><dd><a href="https://github.com/sporkmonger/href-preview">https://github.com/sporkmonger/href-preview</a></dd>
5
+ <dt>Author</dt><dd><a href="mailto:bob@sporkmonger.com">Bob Aman</a></dd>
6
+ <dt>Copyright</dt><dd>Copyright © 2014 Bob Aman</dd>
7
+ <dt>License</dt><dd>Apache 2.0</dd>
8
+ </dl>
9
+
10
+ # Description
11
+
12
+ A comprehensive library for efficiently previewing links.
13
+
14
+ # Features
15
+
16
+ Capable of detecting:
17
+ * OpenGraph metadata
18
+ * Twitter metadata
19
+ * Microdata metadata
20
+ * Assorted microformat metadata
21
+ * RDFa metadata
22
+ * rel="canonical" links
23
+
24
+ # Example Usage
25
+
26
+ require 'href_preview'
27
+ p = HRefPreview.open('http://nyti.ms/1c1zNtX')
28
+ p.title
29
+ # => "A Successor to Sagan Reboots ‘Cosmos’"
30
+ p.description
31
+ # =>
32
+ p.article_html
33
+ p.article_text
34
+ p.published
35
+ p.canonical_uri
36
+
37
+ # Install
38
+
39
+ * gem install href-preview
data/Rakefile ADDED
@@ -0,0 +1,42 @@
1
# Put this checkout's lib/ directory at the front of the load path.
lib_dir = File.expand_path('lib', File.dirname(__FILE__))
$LOAD_PATH.unshift(lib_dir)
$LOAD_PATH.uniq!

require 'rubygems'
require 'rake'

require File.join(File.dirname(__FILE__), 'lib/href_preview', 'version')

# Package identity, derived from the display name and the library version.
PKG_DISPLAY_NAME = 'HREF Preview'
PKG_NAME = PKG_DISPLAY_NAME.downcase.tr(' ', '-')
PKG_VERSION = HRefPreview::VERSION::STRING
PKG_FILE_NAME = [PKG_NAME, PKG_VERSION].join('-')

RELEASE_NAME = "REL #{PKG_VERSION}"
GIT_HUB_URL = "https://github.com/sporkmonger/href-preview"

# Package metadata.
PKG_AUTHOR = 'Bob Aman'
PKG_AUTHOR_EMAIL = 'bob@sporkmonger.com'
PKG_HOMEPAGE = GIT_HUB_URL
PKG_SUMMARY = 'Package Summary'
PKG_DESCRIPTION = <<-TEXT
A simple library for efficiently previewing links.
TEXT

# Files that belong to the package, minus local configuration, VCS
# directories and the Bundler lock file.
PKG_FILES = FileList[
  'lib/**/*', 'spec/**/*', 'vendor/**/*',
  'tasks/**/*', 'website/**/*',
  '[A-Z]*', 'Rakefile'
].exclude(/database\.yml/, /[_\.]git$/, /Gemfile\.lock/)

# The default task runs the specs under rcov only on non-JRuby Ruby 1.8.
RCOV_ENABLED = (RUBY_PLATFORM != 'java' && RUBY_VERSION =~ /^1\.8/)
task :default => (RCOV_ENABLED ? 'spec:rcov' : 'spec')

WINDOWS =
  begin
    RUBY_PLATFORM =~ /mswin|win32|mingw|bccwin|cygwin/
  rescue StandardError
    false
  end

# Command prefix for privileged commands: empty on Windows, nil when the
# SUDOLESS environment variable is set, 'sudo' otherwise.
SUDO =
  if WINDOWS
    ''
  elsif !ENV['SUDOLESS']
    'sudo'
  end

# Load every task definition found under tasks/.
Dir.glob('tasks/**/*.rake').each { |task_file| load task_file }
@@ -0,0 +1,17 @@
1
+ # Copyright 2014 Bob Aman
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ # Just a require alias.
17
+ require 'href_preview'
@@ -0,0 +1,34 @@
1
+ # Copyright 2014 Bob Aman
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ require 'addressable/uri'
17
+ require 'faraday'
18
+ require 'faraday_middleware'
19
+ require 'href_preview/version'
20
+ require 'href_preview/preview'
21
+ require 'href_preview/faraday_common_request'
22
+
23
module HRefPreview
  # Connection used when the caller does not supply one: follows up to five
  # redirects, applies the CommonRequest middleware and talks through the
  # httpclient adapter.
  DEFAULT_CONNECTION = Faraday.new do |conn|
    conn.use(FaradayMiddleware::FollowRedirects, {:limit => 5})
    conn.use(Faraday::CommonRequest)
    conn.adapter(:httpclient)
  end

  ##
  # Fetches `uri` over `connection` and wraps the response in a `Preview`.
  #
  # @param uri [String, Addressable::URI] The address to fetch.
  # @param connection [Faraday::Connection] The connection to issue the GET on.
  # @return [HRefPreview::Preview] A preview built from the response.
  def self.open(uri, connection=DEFAULT_CONNECTION)
    target = Addressable::URI.parse(uri)
    fetched = connection.get(target)
    HRefPreview::Preview.new(fetched, connection)
  end
end
@@ -0,0 +1,40 @@
1
+ # Copyright 2014 Bob Aman
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
module Faraday
  ##
  # Middleware that sets the `User-Agent` and `Accept` headers on every
  # outgoing request.
  class CommonRequest < Faraday::Middleware
    ##
    # @param app [#call] The next middleware (or adapter) in the stack.
    # @param args [Array] Optional; the first element may be a Hash with
    #   `:user_agent` and/or `:accept` keys that override the defaults.
    def initialize(app, *args)
      @app = app
      defaults = {
        :user_agent => (
          "Mozilla/5.0 (compatible; " +
          "HRefPreview/#{HRefPreview::VERSION::STRING}; " +
          "+https://github.com/sporkmonger/href-preview)"
        ),
        :accept => "*/*"
      }
      # Merge the caller's options on top of the defaults. Merging the other
      # way round (as `options.merge!(defaults)`) would silently discard any
      # caller-supplied header value and mutate the caller's hash.
      @options = defaults.merge(args.shift || {})
      @user_agent = @options[:user_agent]
      @accept = @options[:accept]
    end

    ##
    # Adds the configured headers to the request and passes it down the stack.
    #
    # @param env [Hash-like] The Faraday request environment; its
    #   `:request_headers` entry is modified in place.
    # @return The value returned by the rest of the middleware stack.
    def call(env)
      env[:request_headers].merge!(
        'User-Agent' => @user_agent,
        'Accept' => @accept
      )
      @app.call(env)
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # Copyright 2014 Bob Aman
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ require 'fastimage'
17
+
18
# Reopens FastImage to expose the `@uri` instance variable through a
# public reader.
class FastImage
  attr_reader(:uri)
end
@@ -0,0 +1,451 @@
1
+ # Copyright 2014 Bob Aman
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ require 'mime/types'
17
+ require 'faraday'
18
+ require 'nokogiri'
19
+ require 'sanitize'
20
+ require 'fastimage'
21
+ require 'href_preview/fastimage_uri'
22
+ require 'time'
23
+
24
module HRefPreview
  ##
  # Wraps an HTTP response and extracts link-preview metadata from it: MIME
  # type, charset, language, title, description, canonical and short URIs,
  # images, article body and publication timestamps.
  #
  # Extractors memoize with `||=`, so a `nil` result is recomputed on the
  # next call.
  class Preview
    # Captures the `charset` parameter of a Content-Type value. Parameter
    # names are matched case-insensitively.
    CHARSET_PATTERN = /;\s*charset=([^;,]*)/i

    # Queries tried, in order, to locate the element holding the article.
    # A query only counts when it matches exactly one node.
    ARTICLE_NODE_QUERIES = [
      [:xpath, "/html[@itemtype='http://schema.org/NewsArticle']//article[@id='story']"],
      [:xpath, "//*/*[@itemtype='http://schema.org/NewsArticle']"],
      [:xpath, "//*/*[@itemprop='articleBody']"],
      [:css, "article div.article-entry"],
      [:css, "article.post div.entry-content"],
      [:css, "div.post div.postBody"],
      [:css, ".pg_story div#leftcolumn div.body"]
    ].freeze

    # Direct children of the article node carrying exactly one of these
    # `class` attribute values are dropped from `article_html`.
    EXCLUDED_ARTICLE_CLASSES = %w[
      related_links_inline inline-share-btn-label inline-share-btn
    ].freeze

    # Sanitize configuration used for `article_html`. The attribute
    # whitelist is copied before 'span' is overridden: assigning into
    # `RELAXED[:attributes]` directly would modify the hash shared with
    # Sanitize's own configuration.
    SANITIZE_OPTIONS = Sanitize::Config::RELAXED.merge(
      :remove_contents => true,
      :elements => %w[
        a abbr address b bdi bdo blockquote br caption cite code col colgroup dd
        del dfn dl dt em figcaption figure h1 h2 h3 h4 h5 h6 hgroup hr i img ins
        kbd li mark ol p pre q rp rt ruby s samp small span strike strong sub
        summary sup table tbody td tfoot th thead time tr u ul var wbr
      ],
      :attributes => Sanitize::Config::RELAXED[:attributes].merge('span' => [])
    )

    ##
    # Initializes a `Preview` from an HTTP response.
    #
    # @param response The HTTP response to inspect (must respond to
    #   `headers`, `status`, `body` and `env`).
    # @param connection The connection the response was obtained with.
    def initialize(response, connection=DEFAULT_CONNECTION)
      @response = response
      @connection = connection
    end

    attr_reader :response

    ##
    # Returns the MIME type declared in the HTTP headers or HTML meta
    # tags.
    #
    # @return [MIME::Type] The MIME type of the HTTP response.
    def mime_type
      @mime_type ||= (
        lookup_mime_type(response.headers['Content-Type']) ||
        lookup_mime_type(first_value("//*/meta[@http-equiv='Content-Type']/@content")) ||
        lookup_mime_type(first_value("//*/meta[@name='dc.format']/@content"))
      )
    end

    ##
    # Returns the charset declared in the HTTP headers or HTML meta
    # tags.
    #
    # @return [String] The charset of the HTTP response.
    def charset
      @charset ||= begin
        # `||` is required here: `charset = a or b` parses as
        # `(charset = a) or b` and would throw the fallbacks away.
        charset =
          response.headers['Content-Type'].to_s[CHARSET_PATTERN, 1] ||
          first_value("//*/meta[@http-equiv='Content-Type']/@content").to_s[CHARSET_PATTERN, 1] ||
          first_value("//*/meta/@charset")
        charset.strip if charset
      end
    end

    ##
    # @return [String] The two-letter language code for the content, or
    #   `nil` when none is declared or the declared value does not start
    #   with two letters.
    def language
      @language ||= begin
        language =
          response.headers['Content-Language'] ||
          first_value("//*/meta[@http-equiv='Content-Language']/@content") ||
          first_value("//*/meta[@name='dc.language']/@content")
        # Strip the irrelevant '-US' from 'en-US' if it appears.
        code = language.to_s[/\A\s*([a-z]{2})/i, 1]
        code.downcase if code
      end
    end

    ##
    # Returns true if the response had a 2xx HTTP code and the mime type
    # is either HTML or XHTML.
    #
    # @return [true, false] Whether successful HTML response or not.
    def is_html?
      status = response.status
      !!(
        status >= 200 && status < 300 &&
        mime_type && mime_type.sub_type =~ /^x?html/
      )
    end

    ##
    # The DOM for the response body.
    #
    # @return [Nokogiri::HTML::Document]
    #   The DOM, as generated by Nokogiri.
    def dom
      @dom ||= Nokogiri::HTML(response.body)
    end

    ##
    # @return [String] The title of the page, with the site name and any
    #   separator characters around it removed from either end.
    def title
      @title ||= (if is_html?
        title =
          first_value("//*/meta[@property='og:title']/@content") ||
          first_value("//*/meta[@name='dc.title']/@content") ||
          (article_node && first_text("*[@itemprop='headline']", article_node)) ||
          first_text("//*/*[(self::h1 or self::h2) and @itemprop='headline']") ||
          first_text("//*/head/title") ||
          # Unlikely to ever happen
          first_value("//*/meta[@name='twitter:title']/@content") ||
          # Unlikely to ever happen
          first_value("//*/meta[@name='sailthru.title']/@content")
        clean_title(title) if title
      end)
    end

    ##
    # @return [String] The description of the page.
    def description
      @description ||= (if is_html?
        description =
          first_value("//*/meta[@property='og:description']/@content") ||
          first_value("//*/meta[@name='dc.description']/@content") ||
          first_value("//*/meta[@itemprop='description']/@content") ||
          first_value("//*/meta[@name='description']/@content") ||
          first_value("//*/meta[@name='dcterms.abstract']/@content") ||
          # Unlikely to ever happen
          first_value("//*/meta[@name='twitter:description']/@content") ||
          # Unlikely to ever happen
          first_value("//*/meta[@name='sailthru.description']/@content")
        description.gsub(/&nbsp;/, ' ').strip if description
      end)
    end

    ##
    # @return [Addressable::URI] The canonical URI declared by the page, or
    #   the URI of the response when none is declared or the response is
    #   not HTML.
    def canonical_uri
      @canonical_uri ||= begin
        declared = if is_html?
          first_value("//*/link[@rel='canonical']/@href") ||
            first_value("//*/meta[@property='og:url']/@content")
        end
        Addressable::URI.parse(declared || response.env.url.to_s)
      end
    end

    ##
    # @return [Addressable::URI] The short link declared by the page, if any.
    def shortlink_uri
      @shortlink_uri ||= (if is_html?
        href =
          first_value("//*/link[@rel='shortlink']/@href") ||
          first_value("//*[@class='story-short-url']/a/@href")
        Addressable::URI.parse(href) if href
      end)
    end

    ##
    # @return [Addressable::URI] The URI of the first entry in `images`, or
    #   `nil` when there are no images.
    def image_uri
      @image_uri ||= (images.first ? Addressable::URI.parse(images.first.uri) : nil)
    end

    ##
    # @return [Array<FastImage>] One FastImage per distinct image URI: the
    #   page's `og:image` and article thumbnail entries for HTML responses,
    #   or the response itself when its media type is `image`.
    def images
      @images ||= begin
        image_uris = []
        if is_html?
          declared = all_values("//*/meta[@property='og:image']/@content")
          if article_node
            # NOTE(review): XPath compares attribute values case-sensitively
            # and schema.org spells this property `thumbnailUrl`; confirm
            # the lowercase value matches the pages being targeted.
            declared += all_values("meta[@itemprop='thumbnailurl']/@content", article_node)
          end
          image_uris = declared.map { |value| Addressable::URI.parse(value) }
        elsif mime_type && mime_type.media_type == 'image'
          image_uris << canonical_uri
        end
        image_uris.uniq.map { |uri| FastImage.new(uri, :timeout => 0.5) }
      end
    end

    ##
    # @return [String] The `og:type` of the page, or 'article' when the page
    #   contains a schema.org NewsArticle item.
    def item_type
      @item_type ||= (if is_html?
        first_value("//*/meta[@property='og:type']/@content") ||
          ('article' unless dom.xpath("//*[@itemtype='http://schema.org/NewsArticle']").empty?)
      end)
    end

    ##
    # @return [String] The name of the site, from `og:site_name` or
    #   `dc.publisher`.
    def site_name
      @site_name ||= (if is_html?
        first_value("//*/meta[@property='og:site_name']/@content") ||
          first_value("//*/meta[@name='dc.publisher']/@content")
      end)
    end

    ##
    # @return [String] The Twitter handle used by the site.
    def twitter
      @twitter ||= (if is_html?
        handle = first_value("//*/meta[@name='twitter:site']/@content")
        handle if handle && handle =~ /^@/
      end)
    end

    ##
    # @return [Nokogiri::XML::Node] The node holding the article: the result
    #   of the first entry in ARTICLE_NODE_QUERIES that matches exactly one
    #   node, or `nil`.
    def article_node
      @article_node ||= (if is_html?
        found = nil
        ARTICLE_NODE_QUERIES.each do |kind, query|
          nodes = dom.send(kind, query)
          if nodes.size == 1
            found = nodes.first
            break
          end
        end
        found
      end)
    end

    ##
    # @return [String] The sanitized HTML of the article body, or `nil` when
    #   no article node was found.
    def article_html
      @article_html ||= (if is_html? && article_node
        kept = article_node.children.reject do |child|
          css_class = child.attribute('class')
          css_class && EXCLUDED_ARTICLE_CLASSES.include?(css_class.value)
        end
        html = Sanitize.clean(kept.map(&:to_s).join(''), SANITIZE_OPTIONS)
        html = normalize_whitespace(html)

        # Excise empty elements
        reparsed = Nokogiri::HTML.fragment(html)
        excise_empty(reparsed)
        reparsed.to_s
      end)
    end

    ##
    # @return [String] The article body with all markup removed, or `nil`
    #   when there is no article body.
    def article_text
      # article_html is nil for non-HTML responses and when no article node
      # was found; Sanitize is only invoked on an actual string.
      @article_text ||= (Sanitize.clean(article_html) if article_html)
    end

    ##
    # @return [Time] When the content was published. Candidates are tried
    #   in order: `article:published_time`, `datepublished` inside the
    #   article node, `datepublished` anywhere, `dcterms.created`, `dc.date`.
    def published
      @published ||= (
        parse_time(first_value("//*/meta[@property='article:published_time']/@content")) ||
        # NOTE(review): schema.org spells this property `datePublished` and
        # XPath comparison is case-sensitive; confirm the lowercase value.
        (article_node && parse_time(first_value("meta[@itemprop='datepublished']/@content", article_node))) ||
        parse_time(first_value("//*/meta[@itemprop='datepublished']/@content")) ||
        parse_time(first_value("//*/meta[@name='dcterms.created']/@content")) ||
        # Only a date, not a time, and not particularly specific,
        # so this is a fallback at best.
        parse_time(first_value("//*/meta[@name='dc.date']/@content"))
      )
    end

    ##
    # @return [Time] When the content was last modified. Candidates are
    #   tried in order: `article:modified_time`, `datemodified` inside the
    #   article node, `datemodified` anywhere, `dcterms.modified`.
    def updated
      @updated ||= (
        parse_time(first_value("//*/meta[@property='article:modified_time']/@content")) ||
        (article_node && parse_time(first_value("meta[@itemprop='datemodified']/@content", article_node))) ||
        # Searched from the document root, so the `//*/` prefix is needed
        # for this to match anything.
        parse_time(first_value("//*/meta[@itemprop='datemodified']/@content")) ||
        parse_time(first_value("//*/meta[@name='dcterms.modified']/@content"))
      )
    end

    def inspect
      addr = '0x' + ('%x' % (object_id << 1)).rjust(14, '0')
      "#<HRefPreview::Preview:#{addr} TITLE=#{title.inspect}>"
    end

    private

    # Value of the first attribute node matched by `xpath` under `context`,
    # or nil when nothing matches or the value is blank.
    def first_value(xpath, context=dom)
      node = context.xpath(xpath).first
      value = node.value if node
      value unless value.nil? || value.strip.empty?
    end

    # Non-blank values of every attribute node matched by `xpath`.
    def all_values(xpath, context=dom)
      context.xpath(xpath).map { |node| node.value }.reject do |value|
        value.nil? || value.strip.empty?
      end
    end

    # Text of the first element matched by `xpath` under `context`, or nil
    # when nothing matches or the text is blank.
    def first_text(xpath, context=dom)
      node = context.xpath(xpath).first
      text = node.text if node
      text unless text.nil? || text.strip.empty?
    end

    # First registered MIME type for `value`, or nil for a missing value.
    def lookup_mime_type(value)
      MIME::Types[value].first if value && !value.empty?
    end

    # Parses `value` as a Time; returns nil for a missing or unparsable
    # value so that the caller can fall through to its next candidate.
    def parse_time(value)
      Time.parse(value) if value
    rescue ArgumentError
      nil
    end

    # Removes the site name, plus adjacent separator characters, from the
    # start and end of `raw`. The site name is escaped so that regex
    # metacharacters in it are matched literally.
    def clean_title(raw)
      name = Regexp.escape(site_name.to_s)
      cleaned = raw.gsub(/&nbsp;/, ' ').strip
      cleaned = cleaned.sub(/\A#{name}[\s\|\-\:]*/, '')
      cleaned = cleaned.sub(/[\s\|\-\:]*#{name}\z/, '')
      cleaned.strip
    end

    # Normalizes line endings, tabs and blank-line runs in sanitized HTML.
    def normalize_whitespace(html)
      html.
        gsub("\r\n", "\n").
        gsub("\t", " ").
        gsub(/ *\n */, "\n").
        gsub(/\n\n+/, "\n\n").
        gsub(/<p>\n+/, "<p>\n").
        gsub(/\n+<\/p>/, "\n</p>").
        gsub(/<\/p>\n+/, "</p>\n").
        strip
    end

    # Recursively unlinks <script> nodes and nodes that have no attributes,
    # no non-whitespace text and only text children. Children are handled
    # first, so a parent emptied by the recursion is removed as well.
    #
    # NOTE(review): attribute-less void elements such as <br> and <hr>
    # satisfy the emptiness test and are removed too; confirm that is wanted.
    def excise_empty(node)
      if node.respond_to?(:name) && node.name == "script"
        node.unlink
        return
      end
      node.children.each do |child|
        excise_empty(child) if child.element?
      end
      return unless node.respond_to?(:attribute_nodes) && node.respond_to?(:text)
      # `strip.empty?` rather than `=~ /^\s*$/`: the line anchors would also
      # match any text that merely contains a blank line.
      if node.attribute_nodes.empty? && node.text.to_s.strip.empty? &&
          node.children.all? { |child| child.text? }
        node.unlink
      end
    end
  end
end