ruby-readability-discourse 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.travis.yml +6 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/Guardfile +9 -0
- data/LICENSE +202 -0
- data/README.md +107 -0
- data/Rakefile +6 -0
- data/bin/readability +41 -0
- data/lib/readability.rb +492 -0
- data/lib/ruby-readability.rb +1 -0
- data/ruby-readability.gemspec +25 -0
- data/spec/fixtures/bbc.html +2069 -0
- data/spec/fixtures/boing_boing.html +876 -0
- data/spec/fixtures/cant_read.html +426 -0
- data/spec/fixtures/code.html +13 -0
- data/spec/fixtures/images/dim_1416768a.jpg +0 -0
- data/spec/fixtures/images/sign_up_emails_682__703711a.gif +0 -0
- data/spec/fixtures/images/sign_up_emails_682__703712a.gif +0 -0
- data/spec/fixtures/nytimes.html +58 -0
- data/spec/fixtures/sample.html +1198 -0
- data/spec/fixtures/samples/blogpost_with_links-fragments.rb +10 -0
- data/spec/fixtures/samples/blogpost_with_links.html +137 -0
- data/spec/fixtures/samples/channel4-1-fragments.rb +13 -0
- data/spec/fixtures/samples/channel4-1.html +1330 -0
- data/spec/fixtures/samples/foxnews-india1-fragments.rb +13 -0
- data/spec/fixtures/samples/foxnews-india1.html +2058 -0
- data/spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb +31 -0
- data/spec/fixtures/samples/globemail-ottawa-cuts.html +2410 -0
- data/spec/fixtures/should_not_truncate.txt +1077 -0
- data/spec/fixtures/thesun.html +1122 -0
- data/spec/readability_spec.rb +544 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +11 -0
- metadata +152 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b35bc58da2245b0dda58c335c4cfea4551128c01
|
4
|
+
data.tar.gz: ae3c1969f961e4a567dfe27a7fe76ae7d280b637
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d31aeecfd5bff5eaac11bc9d766997b8eb0a15d87d809ebe8619e73ddf7edbb2a59cb837fcc321c3db69271219a841a37e63949bc4a468d690c593fdea61dc78
|
7
|
+
data.tar.gz: 5938c50037f15006cc8be537f420e6d29f5ff0b6f3d0419a9debce85f6f7014566eba82b53653cdcf59e95f5ee955a7f4b73f880b6b0b723c3a4df503e89f27a
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/.yardopts
ADDED
data/Gemfile
ADDED
data/Guardfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,202 @@
|
|
1
|
+
|
2
|
+
Apache License
|
3
|
+
Version 2.0, January 2004
|
4
|
+
http://www.apache.org/licenses/
|
5
|
+
|
6
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
7
|
+
|
8
|
+
1. Definitions.
|
9
|
+
|
10
|
+
"License" shall mean the terms and conditions for use, reproduction,
|
11
|
+
and distribution as defined by Sections 1 through 9 of this document.
|
12
|
+
|
13
|
+
"Licensor" shall mean the copyright owner or entity authorized by
|
14
|
+
the copyright owner that is granting the License.
|
15
|
+
|
16
|
+
"Legal Entity" shall mean the union of the acting entity and all
|
17
|
+
other entities that control, are controlled by, or are under common
|
18
|
+
control with that entity. For the purposes of this definition,
|
19
|
+
"control" means (i) the power, direct or indirect, to cause the
|
20
|
+
direction or management of such entity, whether by contract or
|
21
|
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
22
|
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
23
|
+
|
24
|
+
"You" (or "Your") shall mean an individual or Legal Entity
|
25
|
+
exercising permissions granted by this License.
|
26
|
+
|
27
|
+
"Source" form shall mean the preferred form for making modifications,
|
28
|
+
including but not limited to software source code, documentation
|
29
|
+
source, and configuration files.
|
30
|
+
|
31
|
+
"Object" form shall mean any form resulting from mechanical
|
32
|
+
transformation or translation of a Source form, including but
|
33
|
+
not limited to compiled object code, generated documentation,
|
34
|
+
and conversions to other media types.
|
35
|
+
|
36
|
+
"Work" shall mean the work of authorship, whether in Source or
|
37
|
+
Object form, made available under the License, as indicated by a
|
38
|
+
copyright notice that is included in or attached to the work
|
39
|
+
(an example is provided in the Appendix below).
|
40
|
+
|
41
|
+
"Derivative Works" shall mean any work, whether in Source or Object
|
42
|
+
form, that is based on (or derived from) the Work and for which the
|
43
|
+
editorial revisions, annotations, elaborations, or other modifications
|
44
|
+
represent, as a whole, an original work of authorship. For the purposes
|
45
|
+
of this License, Derivative Works shall not include works that remain
|
46
|
+
separable from, or merely link (or bind by name) to the interfaces of,
|
47
|
+
the Work and Derivative Works thereof.
|
48
|
+
|
49
|
+
"Contribution" shall mean any work of authorship, including
|
50
|
+
the original version of the Work and any modifications or additions
|
51
|
+
to that Work or Derivative Works thereof, that is intentionally
|
52
|
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
53
|
+
or by an individual or Legal Entity authorized to submit on behalf of
|
54
|
+
the copyright owner. For the purposes of this definition, "submitted"
|
55
|
+
means any form of electronic, verbal, or written communication sent
|
56
|
+
to the Licensor or its representatives, including but not limited to
|
57
|
+
communication on electronic mailing lists, source code control systems,
|
58
|
+
and issue tracking systems that are managed by, or on behalf of, the
|
59
|
+
Licensor for the purpose of discussing and improving the Work, but
|
60
|
+
excluding communication that is conspicuously marked or otherwise
|
61
|
+
designated in writing by the copyright owner as "Not a Contribution."
|
62
|
+
|
63
|
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
64
|
+
on behalf of whom a Contribution has been received by Licensor and
|
65
|
+
subsequently incorporated within the Work.
|
66
|
+
|
67
|
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
68
|
+
this License, each Contributor hereby grants to You a perpetual,
|
69
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
70
|
+
copyright license to reproduce, prepare Derivative Works of,
|
71
|
+
publicly display, publicly perform, sublicense, and distribute the
|
72
|
+
Work and such Derivative Works in Source or Object form.
|
73
|
+
|
74
|
+
3. Grant of Patent License. Subject to the terms and conditions of
|
75
|
+
this License, each Contributor hereby grants to You a perpetual,
|
76
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
77
|
+
(except as stated in this section) patent license to make, have made,
|
78
|
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
79
|
+
where such license applies only to those patent claims licensable
|
80
|
+
by such Contributor that are necessarily infringed by their
|
81
|
+
Contribution(s) alone or by combination of their Contribution(s)
|
82
|
+
with the Work to which such Contribution(s) was submitted. If You
|
83
|
+
institute patent litigation against any entity (including a
|
84
|
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
85
|
+
or a Contribution incorporated within the Work constitutes direct
|
86
|
+
or contributory patent infringement, then any patent licenses
|
87
|
+
granted to You under this License for that Work shall terminate
|
88
|
+
as of the date such litigation is filed.
|
89
|
+
|
90
|
+
4. Redistribution. You may reproduce and distribute copies of the
|
91
|
+
Work or Derivative Works thereof in any medium, with or without
|
92
|
+
modifications, and in Source or Object form, provided that You
|
93
|
+
meet the following conditions:
|
94
|
+
|
95
|
+
(a) You must give any other recipients of the Work or
|
96
|
+
Derivative Works a copy of this License; and
|
97
|
+
|
98
|
+
(b) You must cause any modified files to carry prominent notices
|
99
|
+
stating that You changed the files; and
|
100
|
+
|
101
|
+
(c) You must retain, in the Source form of any Derivative Works
|
102
|
+
that You distribute, all copyright, patent, trademark, and
|
103
|
+
attribution notices from the Source form of the Work,
|
104
|
+
excluding those notices that do not pertain to any part of
|
105
|
+
the Derivative Works; and
|
106
|
+
|
107
|
+
(d) If the Work includes a "NOTICE" text file as part of its
|
108
|
+
distribution, then any Derivative Works that You distribute must
|
109
|
+
include a readable copy of the attribution notices contained
|
110
|
+
within such NOTICE file, excluding those notices that do not
|
111
|
+
pertain to any part of the Derivative Works, in at least one
|
112
|
+
of the following places: within a NOTICE text file distributed
|
113
|
+
as part of the Derivative Works; within the Source form or
|
114
|
+
documentation, if provided along with the Derivative Works; or,
|
115
|
+
within a display generated by the Derivative Works, if and
|
116
|
+
wherever such third-party notices normally appear. The contents
|
117
|
+
of the NOTICE file are for informational purposes only and
|
118
|
+
do not modify the License. You may add Your own attribution
|
119
|
+
notices within Derivative Works that You distribute, alongside
|
120
|
+
or as an addendum to the NOTICE text from the Work, provided
|
121
|
+
that such additional attribution notices cannot be construed
|
122
|
+
as modifying the License.
|
123
|
+
|
124
|
+
You may add Your own copyright statement to Your modifications and
|
125
|
+
may provide additional or different license terms and conditions
|
126
|
+
for use, reproduction, or distribution of Your modifications, or
|
127
|
+
for any such Derivative Works as a whole, provided Your use,
|
128
|
+
reproduction, and distribution of the Work otherwise complies with
|
129
|
+
the conditions stated in this License.
|
130
|
+
|
131
|
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
132
|
+
any Contribution intentionally submitted for inclusion in the Work
|
133
|
+
by You to the Licensor shall be under the terms and conditions of
|
134
|
+
this License, without any additional terms or conditions.
|
135
|
+
Notwithstanding the above, nothing herein shall supersede or modify
|
136
|
+
the terms of any separate license agreement you may have executed
|
137
|
+
with Licensor regarding such Contributions.
|
138
|
+
|
139
|
+
6. Trademarks. This License does not grant permission to use the trade
|
140
|
+
names, trademarks, service marks, or product names of the Licensor,
|
141
|
+
except as required for reasonable and customary use in describing the
|
142
|
+
origin of the Work and reproducing the content of the NOTICE file.
|
143
|
+
|
144
|
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
145
|
+
agreed to in writing, Licensor provides the Work (and each
|
146
|
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
147
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
148
|
+
implied, including, without limitation, any warranties or conditions
|
149
|
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
150
|
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
151
|
+
appropriateness of using or redistributing the Work and assume any
|
152
|
+
risks associated with Your exercise of permissions under this License.
|
153
|
+
|
154
|
+
8. Limitation of Liability. In no event and under no legal theory,
|
155
|
+
whether in tort (including negligence), contract, or otherwise,
|
156
|
+
unless required by applicable law (such as deliberate and grossly
|
157
|
+
negligent acts) or agreed to in writing, shall any Contributor be
|
158
|
+
liable to You for damages, including any direct, indirect, special,
|
159
|
+
incidental, or consequential damages of any character arising as a
|
160
|
+
result of this License or out of the use or inability to use the
|
161
|
+
Work (including but not limited to damages for loss of goodwill,
|
162
|
+
work stoppage, computer failure or malfunction, or any and all
|
163
|
+
other commercial damages or losses), even if such Contributor
|
164
|
+
has been advised of the possibility of such damages.
|
165
|
+
|
166
|
+
9. Accepting Warranty or Additional Liability. While redistributing
|
167
|
+
the Work or Derivative Works thereof, You may choose to offer,
|
168
|
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
169
|
+
or other liability obligations and/or rights consistent with this
|
170
|
+
License. However, in accepting such obligations, You may act only
|
171
|
+
on Your own behalf and on Your sole responsibility, not on behalf
|
172
|
+
of any other Contributor, and only if You agree to indemnify,
|
173
|
+
defend, and hold each Contributor harmless for any liability
|
174
|
+
incurred by, or claims asserted against, such Contributor by reason
|
175
|
+
of your accepting any such warranty or additional liability.
|
176
|
+
|
177
|
+
END OF TERMS AND CONDITIONS
|
178
|
+
|
179
|
+
APPENDIX: How to apply the Apache License to your work.
|
180
|
+
|
181
|
+
To apply the Apache License to your work, attach the following
|
182
|
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
183
|
+
replaced with your own identifying information. (Don't include
|
184
|
+
the brackets!) The text should be enclosed in the appropriate
|
185
|
+
comment syntax for the file format. We also recommend that a
|
186
|
+
file or class name and description of purpose be included on the
|
187
|
+
same "printed page" as the copyright notice for easier
|
188
|
+
identification within third-party archives.
|
189
|
+
|
190
|
+
Copyright [yyyy] [name of copyright owner]
|
191
|
+
|
192
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
193
|
+
you may not use this file except in compliance with the License.
|
194
|
+
You may obtain a copy of the License at
|
195
|
+
|
196
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
197
|
+
|
198
|
+
Unless required by applicable law or agreed to in writing, software
|
199
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
200
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
201
|
+
See the License for the specific language governing permissions and
|
202
|
+
limitations under the License.
|
data/README.md
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
Ruby Readability
|
2
|
+
================
|
3
|
+
|
4
|
+
Ruby Readability is a tool for extracting the primary readable content of a
|
5
|
+
webpage. It is a Ruby port of arc90's readability project.
|
6
|
+
|
7
|
+
Build Status
|
8
|
+
------------
|
9
|
+
|
10
|
+
[![Build Status](https://travis-ci.org/cantino/ruby-readability.png)](https://travis-ci.org/cantino/ruby-readability)
|
11
|
+
|
12
|
+
Install
|
13
|
+
-------
|
14
|
+
|
15
|
+
Command line:
|
16
|
+
|
17
|
+
(sudo) gem install ruby-readability
|
18
|
+
|
19
|
+
Bundler:
|
20
|
+
|
21
|
+
gem "ruby-readability", :require => 'readability'
|
22
|
+
|
23
|
+
|
24
|
+
Example
|
25
|
+
-------
|
26
|
+
|
27
|
+
require 'rubygems'
|
28
|
+
require 'readability'
|
29
|
+
require 'open-uri'
|
30
|
+
|
31
|
+
source = open('http://lab.arc90.com/experiments/readability/').read
|
32
|
+
puts Readability::Document.new(source).content
|
33
|
+
|
34
|
+
|
35
|
+
Options
|
36
|
+
-------
|
37
|
+
|
38
|
+
You may provide options to `Readability::Document.new`, including:
|
39
|
+
|
40
|
+
* `:tags`: the base whitelist of tags to sanitize, defaults to `%w[div p]`;
|
41
|
+
* `:remove_empty_nodes`: remove `<p>` tags that have no text content; also
|
42
|
+
removes `<p>` tags that contain only images;
|
43
|
+
* `:attributes`: whitelist of allowed attributes;
|
44
|
+
* `:debug`: provide debugging output, defaults false;
|
45
|
+
* `:encoding`: if the page is of a known encoding, you can specify it; if left
|
46
|
+
unspecified, the encoding will be guessed (not supported on Ruby 1.8). If you wish
|
47
|
+
to disable guessing, supply `:do_not_guess_encoding => true`;
|
48
|
+
* `:html_headers`: when the encoding is guessed, these will be passed to the
|
49
|
+
`guess_html_encoding` gem to aid with guessing the HTML encoding;
|
50
|
+
* `:ignore_image_format`: for use with .images. For example:
|
51
|
+
`:ignore_image_format => ["gif", "png"]`;
|
52
|
+
* `:min_image_height`: set a minimum image height for `#images`;
|
53
|
+
* `:min_image_width`: set a minimum image width for `#images`.
|
54
|
+
|
55
|
+
|
56
|
+
Command Line Tool
|
57
|
+
-----------------
|
58
|
+
|
59
|
+
Readability comes with a command-line tool for experimentation in
|
60
|
+
`bin/readability`.
|
61
|
+
|
62
|
+
Usage: readability [options] URL
|
63
|
+
-d, --debug Show debug output
|
64
|
+
-i, --images Keep images and links
|
65
|
+
-h, --help Show this message
|
66
|
+
|
67
|
+
|
68
|
+
Images
|
69
|
+
------
|
70
|
+
|
71
|
+
You can get a list of images in the content area with `Document#images`. This
|
72
|
+
feature requires that the `fastimage` gem be installed.
|
73
|
+
|
74
|
+
rbody = Readability::Document.new(body, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false)
|
75
|
+
rbody.images
|
76
|
+
|
77
|
+
Related Projects
|
78
|
+
----------------
|
79
|
+
|
80
|
+
* [newspaper](https://github.com/codelucas/newspaper) is an advanced news extraction, article extraction, and content curation library for Python.
|
81
|
+
|
82
|
+
Potential Issues
|
83
|
+
----------------
|
84
|
+
|
85
|
+
If you're on a Mac and are getting segmentation faults, see the discussion at
|
86
|
+
<https://github.com/sparklemotion/nokogiri/issues/404> and consider updating
|
87
|
+
your version of `libxml2`. Version 2.7.8 of `libxml2`, installed with `brew`,
|
88
|
+
worked for me:
|
89
|
+
|
90
|
+
gem install nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
|
91
|
+
|
92
|
+
Or if you're using bundler and Rails 3, you can run this command to make
|
93
|
+
bundler always globally build `nokogiri` this way:
|
94
|
+
|
95
|
+
bundle config build.nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
|
96
|
+
|
97
|
+
|
98
|
+
License
|
99
|
+
-------
|
100
|
+
|
101
|
+
This code is under the Apache License 2.0. See <http://www.apache.org/licenses/LICENSE-2.0>.
|
102
|
+
|
103
|
+
Ruby port by cantino, starrhorne, libc, and iterationlabs. Special thanks to fizx and marcosinger.
|
104
|
+
|
105
|
+
|
106
|
+
[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/cantino/ruby-readability/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
|
107
|
+
|
data/Rakefile
ADDED
data/bin/readability
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line front end for Readability::Document: fetches the page named on
# the command line and prints its extracted main content to standard output.
require 'rubygems'
require 'open-uri'
require 'optparse'
require File.dirname(__FILE__) + '/../lib/readability'

options = { :debug => false, :images => false }
options_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename($0)} [options] URL"

  opts.on("-d", "--debug", "Show debug output") do |v|
    options[:debug] = v
  end

  opts.on("-i", "--images", "Keep images and links") do |i|
    options[:images] = i
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
options_parser.parse!

# Exactly one positional argument (the page to read) is required.
if ARGV.length != 1
  STDERR.puts options_parser
  exit 1
end

# Never hand the raw argument to Kernel#open: it would execute "| command"
# arguments as a subprocess, and on Ruby 3.0+ it no longer opens URLs at all.
# Remote addresses go through open-uri's URI#open; anything else is read as a
# local file, which the previous Kernel#open call also allowed.
source = ARGV.first
text = if source =~ /\A(?:https?|ftp):\/\//i
  URI.parse(source).open.read
else
  File.read(source)
end

# With --images, widen the tag/attribute whitelist so <img> and <a> survive.
params = if options[:images]
  { :tags => %w[div p img a],
    :attributes => %w[src href],
    :remove_empty_nodes => false,
    :debug => options[:debug] }
else
  { :debug => options[:debug] }
end

puts Readability::Document.new(text, params).content
|
data/lib/readability.rb
ADDED
@@ -0,0 +1,492 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'guess_html_encoding'
|
6
|
+
|
7
|
+
module Readability
|
8
|
+
class Document
|
9
|
+
# Settings merged underneath whatever the caller passes to Document.new.
DEFAULT_OPTIONS = {
  # #content retries with looser settings while the extracted text is
  # shorter than this many characters.
  :retry_length               => 250,
  # <p>/<td> elements with less text than this are not scored.
  :min_text_length            => 25,
  # Run remove_unlikely_candidates! before scoring.
  :remove_unlikely_candidates => true,
  # Let class/id names raise or lower a node's score (see class_weight).
  :weight_classes             => true,
  # Copied into @clean_conditionally; relaxed as the last retry step of
  # #content. NOTE(review): the code that consumes it is outside this excerpt.
  :clean_conditionally        => true,
  # Per the README: drop <p> tags that have no text content.
  :remove_empty_nodes         => true,
  # Thresholds and format filter applied by image_meets_criteria?.
  :min_image_width            => 130,
  :min_image_height           => 80,
  :ignore_image_format        => [],
  # Optional CSS selectors consumed by handle_exclusions!.
  :blacklist                  => nil,
  :whitelist                  => nil
}.freeze
|
22
|
+
|
23
|
+
attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
|
24
|
+
|
25
|
+
# Prepares a document for extraction.
#
# input   - the raw HTML string.
# options - Hash of overrides merged over DEFAULT_OPTIONS. Keys read here
#           that are not in the defaults: :encoding, :html_headers and
#           :do_not_guess_encoding.
#
# Unless an explicit :encoding is supplied, the input is re-encoded through
# GuessHtmlEncoding (skipped with :do_not_guess_encoding) and the resulting
# string encoding is recorded in @options[:encoding] for the parser.
def initialize(input, options = {})
  @options = DEFAULT_OPTIONS.merge(options)
  @input = input

  # Test for the capability (String#encoding, i.e. Ruby 1.9 and newer)
  # rather than pattern-matching RUBY_VERSION: the old /^(1\.9|2)/ check
  # silently disabled encoding handling on Ruby 3 and later.
  if String.method_defined?(:encoding) && !@options[:encoding]
    @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
    @options[:encoding] = @input.encoding.to_s
  end

  # Runs of two or more <br> become paragraph breaks and <font> tags become
  # <span> before the markup is parsed.
  @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')

  # These three flags are relaxed one at a time by #content when too little
  # text is extracted, so they live in instance variables, not in @options.
  @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
  @weight_classes = @options[:weight_classes]
  @clean_conditionally = @options[:clean_conditionally]
  @best_candidate_has_image = true

  make_html
  handle_exclusions!(@options[:whitelist], @options[:blacklist])
end
|
42
|
+
|
43
|
+
# Strips <script>/<style> nodes, applies the pre-scoring clean-up passes and
# then stores the scored nodes in @candidates and the winner in
# @best_candidate. Returns the best candidate.
def prepare_candidates
  @html.css("script, style").each(&:remove)
  remove_unlikely_candidates! if @remove_unlikely_candidates
  transform_misused_divs_into_paragraphs!

  shortest_scored_text = options[:min_text_length]
  @candidates = score_paragraphs(shortest_scored_text)
  @best_candidate = select_best_candidate(@candidates)
end
|
51
|
+
|
52
|
+
# Applies the optional CSS-selector filters to the parsed document.
#
# whitelist - selector for the only nodes to keep inside <body>, or nil.
# blacklist - selector for nodes to delete, or nil.
#
# The blacklist is applied first. When anything was requested, @input is
# replaced by the serialised filtered document so that later re-parses
# (see make_html) start from it. Always returns nil.
def handle_exclusions!(whitelist, blacklist)
  return if !whitelist && !blacklist

  if blacklist
    unwanted = @html.css(blacklist)
    unwanted.each(&:remove) if unwanted
  end

  if whitelist
    # Serialise the matches before the body is emptied below.
    kept_markup = @html.css(whitelist).to_s
    body = @html.at_css('body')

    if body
      body.css('*').each(&:remove)
      body.inner_html = kept_markup
    end
  end

  @input = @html.to_s
  nil
end
|
78
|
+
|
79
|
+
# (Re)parses @input into @html using the configured encoding and removes
# every HTML comment node. The two parameters are accepted but unused.
def make_html(whitelist=nil, blacklist=nil)
  encoding = @options[:encoding]
  @html = Nokogiri::HTML(@input, nil, encoding)

  # An empty string or a redirect response can parse to a document with no
  # <body>; substitute an empty one so later body lookups still work.
  @html = Nokogiri::HTML('<body />', nil, encoding) if @html.css('body').empty?

  @html.xpath('//comment()').each(&:remove)
end
|
87
|
+
|
88
|
+
# Lists the URLs of acceptable images inside the best candidate.
#
# content - node to search; only honoured together with reload (internal).
# reload  - true on the internal second pass that searches the whole
#           document after the best candidate yielded nothing.
#
# Returns an Array of src strings that pass image_meets_criteria?.
# Raises RuntimeError when the optional fastimage gem is not installed.
def images(content=nil, reload=false)
  # fastimage is an optional dependency, so it is loaded lazily here.
  begin
    require 'fastimage'
  rescue LoadError
    raise "Please install fastimage in order to use the #images feature."
  end

  @best_candidate_has_image = false if reload

  prepare_candidates
  list_images = []
  tested_images = []
  content = @best_candidate[:elem] unless reload

  return list_images if content.nil?

  content.css("img").map(&:attributes).each do |element|
    next unless element["src"]

    url = element["src"].value
    height = element["height"].nil? ? 0 : element["height"].value.to_i
    width = element["width"].nil? ? 0 : element["width"].value.to_i

    # Only probe the network when the markup does not state both dimensions.
    if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?)
      image = get_image_size(url)
      next unless image
    else
      image = {:width => width, :height => height}
    end

    # Take the extension from the path only: with the query string or
    # fragment attached, "a.gif?v=1" would yield "gif?v=1" and slip past
    # the :ignore_image_format filter.
    image[:format] = File.extname(url.sub(/[?#].*\z/m, "")).gsub(".", "")

    if tested_images.include?(url)
      debug("Image was tested: #{url}")
      next
    end

    tested_images.push(url)
    if image_meets_criteria?(image)
      list_images << url
    else
      debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
    end
  end

  # Nothing usable in the best candidate: retry once over the whole document.
  (list_images.empty? && content != @html) ? images(@html, true) : list_images
end
|
136
|
+
|
137
|
+
# Asks FastImage for the dimensions of the image at url.
#
# Returns {:width => w, :height => h}, or nil (after logging through debug)
# when the size cannot be determined or any StandardError is raised.
def get_image_size(url)
  width, height = FastImage.size(url)
  raise "Couldn't get size." if [width, height].any?(&:nil?)

  { :width => width, :height => height }
rescue => error
  debug("Image error: #{error}")
  nil
end
|
145
|
+
|
146
|
+
# Decides whether an image description (:width, :height, :format) passes the
# configured filters: its format must not be listed in :ignore_image_format
# and both dimensions must reach the configured minimums (0 when unset).
def image_meets_criteria?(image)
  ignored_formats = options[:ignore_image_format]
  return false if ignored_formats.include?(image[:format].downcase)

  min_width = options[:min_image_width] || 0
  min_height = options[:min_image_height] || 0
  image[:width] >= min_width && image[:height] >= min_height
end
|
150
|
+
|
151
|
+
# Patterns shared by the extraction passes. Frozen like the other constant
# tables in this class so the set cannot be modified at runtime.
REGEXES = {
  # Candidate-filtering patterns; the code using these two is outside this
  # excerpt.
  :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
  :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
  # class/id hints: class_weight adds 25 for a positive match and subtracts
  # 25 for a negative one.
  :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
  :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
  :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
  # Applied to the raw input in #initialize: two or more consecutive <br>
  # tags become a paragraph break, and <font> tags become <span>.
  :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
  :replaceFontsRe => /<(\/?)font[^>]*>/i,
  :trimRe => /^\s+|\s+$/,
  :normalizeRe => /\s{2,}/,
  # NOTE(review): the second alternative below looks like an entity that was
  # decoded somewhere along the way (possibly "&nbsp;?") - confirm.
  :killBreaksRe => /(<br\s*\/?>(\s| ?)*){1,}/,
  # Accept both http and https; an http-only pattern misses https video URLs.
  :videoRe => /https?:\/\/(www\.)?(youtube|vimeo)\.com/i
}.freeze
|
164
|
+
|
165
|
+
# Returns the text of the first <title> element, or nil when there is none.
def title
  title_node = @html.css("title").first
  return nil unless title_node

  title_node.text
end
|
169
|
+
|
170
|
+
# Look through the @html document looking for the author.
# Precedence Information here on the wiki: (TODO attach wiki URL if it is accepted)
#
# Sources are tried in this order and the first non-blank value wins:
#   1. <meta name="dc.creator" content="Finch - http://www.getfinch.com" />
#   2. an element whose class contains "fn" inside one whose class contains
#      "vcard", e.g.
#      <span class="byline author vcard"><span>By</span><cite class="fn">Austin Fonacier</cite></span>
#   3. <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>
#      (TODO: strip out the (rel)?)
#   4. any element with id="author"
#
# Returns the stripped author String, or nil if no author is detected.
def author
  @html.xpath('//meta[@name = "dc.creator"]').each do |element|
    name = element['content'].to_s.strip
    return name unless name.empty?
  end

  text_queries = [
    '//*[contains(@class, "vcard")]//*[contains(@class, "fn")]',
    '//a[@rel = "author"]',
    '//*[@id = "author"]'
  ]

  text_queries.each do |query|
    @html.xpath(query).each do |element|
      # Node text is never nil, so test for blankness explicitly: an empty
      # match must fall through to the next strategy instead of being
      # returned as "".
      name = element.text.to_s.strip
      return name unless name.empty?
    end
  end

  nil
end
|
210
|
+
|
211
|
+
# Extracts and sanitises the main article.
#
# remove_unlikely_candidates - pass false to skip the unlikely-candidate
#                              pruning from the first attempt onwards.
#
# When the extracted text is shorter than options[:retry_length], one
# restriction is relaxed (unlikely-candidate removal, then class weighting,
# then conditional cleaning), the document is re-parsed and extraction is
# attempted again. Returns the sanitised result of the last attempt.
def content(remove_unlikely_candidates = :default)
  @remove_unlikely_candidates = false if remove_unlikely_candidates == false

  prepare_candidates
  extracted = get_article(@candidates, @best_candidate)

  # Sanitise before measuring: the length check below reads the article
  # after sanitize has been applied to it.
  cleaned = sanitize(extracted, @candidates, options)
  return cleaned if extracted.text.strip.length >= options[:retry_length]

  if @remove_unlikely_candidates
    @remove_unlikely_candidates = false
  elsif @weight_classes
    @weight_classes = false
  elsif @clean_conditionally
    @clean_conditionally = false
  else
    # nothing we can do
    return cleaned
  end

  make_html
  content
end
|
236
|
+
|
237
|
+
# Assembles the article from the best candidate and its related siblings.
#
# candidates     - Hash of node => score entry (as built by score_paragraphs).
# best_candidate - the winning entry (:elem and :content_score).
#
# Now that we have the top candidate, look through its siblings for content
# that might also be related: things like preambles, content split by ads
# that we removed, etc. Returns a new <div> node holding copies of the
# selected siblings.
def get_article(candidates, best_candidate)
  sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
  output = Nokogiri::XML::Node.new('div', @html)

  best_candidate[:elem].parent.children.each do |sibling|
    append = sibling == best_candidate[:elem]
    append ||= candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold

    # The paragraph heuristic may only ADD a sibling. Assigning its result
    # unconditionally used to reset an earlier positive decision to nil, so
    # a <p> that was the best candidate itself, or a high-scoring sibling,
    # could be dropped from the output.
    if !append && sibling.name.downcase == "p"
      link_density = get_link_density(sibling)
      node_content = sibling.text
      node_length = node_content.length

      append = (node_length > 80 && link_density < 0.25) ||
               (node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/)
    end

    next unless append

    sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects
    sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase)
    output << sibling_dup
  end

  output
end
|
269
|
+
|
270
|
+
# Picks the highest-scoring entry from candidates (node => entry Hash).
#
# Logs the five best entries and the winner through debug. When there are
# no candidates at all, falls back to the document's <body> with a score of
# zero. Returns the winning entry Hash.
def select_best_candidate(candidates)
  ranked = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

  # "name#id.class with score N", shared by both debug messages.
  describe = lambda do |entry|
    node = entry[:elem]
    "#{node.name}##{node[:id]}.#{node[:class]} with score #{entry[:content_score]}"
  end

  debug("Top 5 candidates:")
  ranked.first(5).each { |entry| debug("Candidate #{describe.call(entry)}") }

  winner = ranked.first || { :elem => @html.css("body").first, :content_score => 0 }
  debug("Best candidate #{describe.call(winner)}")

  winner
end
|
283
|
+
|
284
|
+
# Returns the share of elem's text that sits inside <a> descendants, as a
# Float (0.0 means no link text).
#
# An element with no text at all has density 0.0. Dividing 0 by 0.0 would
# give NaN, which turns every score it is multiplied into into NaN and makes
# the candidate sort fail.
def get_link_density(elem)
  text_length = elem.text.length
  return 0.0 if text_length.zero?

  link_length = elem.css("a").map(&:text).join("").length
  link_length / text_length.to_f
end
|
289
|
+
|
290
|
+
# Scores the parents and grandparents of every <p>/<td> in the document.
#
# min_text_length - paragraphs with less text than this are not counted.
#
# Each counted paragraph earns 1 point, plus one per comma-separated chunk
# of its text, plus one per full 100 characters (at most 3). The parent
# receives the whole amount and the grandparent half of it. Finally every
# total is scaled by (1 - link density) of its node.
#
# Returns a Hash of node => entry, where entries come from score_node.
def score_paragraphs(min_text_length)
  scored = {}

  @html.css("p,td").each do |paragraph|
    parent = paragraph.parent
    grand_parent = parent.respond_to?(:parent) ? parent.parent : nil
    text = paragraph.text

    # Too short to be meaningful content.
    next if text.length < min_text_length

    scored[parent] ||= score_node(parent)
    scored[grand_parent] ||= score_node(grand_parent) if grand_parent

    points = 1 + text.split(',').length + [(text.length / 100).to_i, 3].min

    scored[parent][:content_score] += points
    scored[grand_parent][:content_score] += points / 2.0 if grand_parent
  end

  # Good content should have a relatively small link density (5% or less)
  # and be mostly unaffected by this scaling.
  scored.each_pair do |node, entry|
    entry[:content_score] = entry[:content_score] * (1 - get_link_density(node))
  end

  scored
end
|
319
|
+
|
320
|
+
# Weighs an element by its class and id attributes.
#
# e - an object indexable with :class and :id.
#
# For each of the two attributes that is present and non-empty, 25 is
# subtracted when it matches REGEXES[:negativeRe] and 25 is added when it
# matches REGEXES[:positiveRe]; an attribute matching both cancels out.
#
# Returns the Integer weight, or 0 when @weight_classes is not set.
def class_weight(e)
  return 0 unless @weight_classes

  [:class, :id].inject(0) do |total, attribute|
    value = e[attribute]
    next total unless value && value != ""

    total -= 25 if value =~ REGEXES[:negativeRe]
    total += 25 if value =~ REGEXES[:positiveRe]
    total
  end
end
|
336
|
+
|
337
|
+
# Score adjustment applied per (lower-cased) tag name; tags that are not
# listed contribute nothing.
ELEMENT_SCORES = {
  'div'        => 5,
  'blockquote' => 3,
  'form'       => -3,
  'th'         => -5
}.freeze

# Builds the initial candidate record for an element.
#
# elem - a node responding to #name (and whatever #class_weight reads).
#
# Returns a Hash with :content_score (class/id weight plus the tag
# adjustment from ELEMENT_SCORES) and :elem (the node itself).
def score_node(elem)
  attribute_weight = class_weight(elem)
  tag_adjustment = ELEMENT_SCORES.fetch(elem.name.downcase, 0)

  { :content_score => attribute_weight + tag_adjustment, :elem => elem }
end
|
349
|
+
|
350
|
+
# Prints a diagnostic line, but only when the :debug option is set.
#
# str - the message to print.
#
# Returns nil.
def debug(str)
  return unless options[:debug]

  puts str
end
|
353
|
+
|
354
|
+
# Removes elements from @html whose class/id marks them as unlikely content.
#
# An element is removed when its class and id, concatenated, match
# REGEXES[:unlikelyCandidatesRe] without also matching
# REGEXES[:okMaybeItsACandidateRe]. <html> and <body> are never removed.
# Every removal is reported through #debug.
def remove_unlikely_candidates!
  @html.css("*").each do |node|
    signature = "#{node[:class]}#{node[:id]}"

    next unless signature =~ REGEXES[:unlikelyCandidatesRe]
    next if signature =~ REGEXES[:okMaybeItsACandidateRe]
    next if %w[html body].include?(node.name.downcase)

    debug("Removing unlikely candidate - #{signature}")
    node.remove
  end
end
|
363
|
+
|
364
|
+
# Renames <div>s that hold no block-level markup to <p>.
#
# A <div> qualifies when its inner HTML does not match
# REGEXES[:divToPElementsRe]. Each rename is reported through #debug.
# Elements other than <div> are left untouched (bare text nodes are not
# wrapped in paragraphs).
def transform_misused_divs_into_paragraphs!
  @html.css("*").each do |node|
    next unless node.name.downcase == "div"
    next if node.inner_html =~ REGEXES[:divToPElementsRe]

    debug("Altering div(##{node[:id]}.#{node[:class]}) to p")
    node.name = "p"
  end
end
|
383
|
+
|
384
|
+
# Cleans up the extracted content and serialises it.
#
# node       - root node of the extracted content.
# candidates - Hash of scored nodes, forwarded to #clean_conditionally.
# options    - accepted for compatibility but not read here; the method
#              consults @options (:remove_empty_nodes, :tags, :attributes).
#
# Steps, in order:
#   1. drop headings with a negative class weight or a link density above 0.33;
#   2. drop every form, object, iframe and embed;
#   3. when @options[:remove_empty_nodes] is set, drop <p>s without text
#      (which also drops <p>s that contain only images);
#   4. conditionally clean <table>s, <ul>s and <div>s;
#   5. strip attributes from whitelisted tags (except those named in
#      @options[:attributes]) and replace every other element by its text,
#      padded with spaces for block-like tags so "a<br>b" keeps a gap.
#
# Returns the serialised markup as a String, with each run of CR/LF/FF
# characters collapsed into a single newline.
def sanitize(node, candidates, options = {})
  node.css("h1, h2, h3, h4, h5, h6").each do |heading|
    heading.remove if class_weight(heading) < 0 || get_link_density(heading) > 0.33
  end

  node.css("form, object, iframe, embed").each(&:remove)

  if @options[:remove_empty_nodes]
    node.css("p").each do |paragraph|
      paragraph.remove if paragraph.content.strip.empty?
    end
  end

  clean_conditionally(node, candidates, "table, ul, div")

  # Hash lookups instead of repeated Array#include? calls.
  whitelist = {}
  (@options[:tags] || %w[div p]).each { |tag| whitelist[tag] = true }

  replace_with_whitespace = {}
  %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center].each do |tag|
    replace_with_whitespace[tag] = true
  end

  kept_attributes = @options[:attributes]

  ([node] + node.css("*")).each do |el|
    tag = el.node_name

    if whitelist[tag]
      # Whitelisted element: keep it, strip the attributes not asked for.
      el.attributes.each do |attr_name, _attr|
        next if kept_attributes && kept_attributes.include?(attr_name.to_s)

        el.delete(attr_name)
      end
    elsif el.parent.nil?
      # The root itself is not whitelisted: the result is just its text.
      node = Nokogiri::XML::Text.new(el.text, el.document)
      break
    else
      # Any other element is replaced by its text content.
      padding = replace_with_whitespace[tag] ? " " : ""
      el.swap(Nokogiri::XML::Text.new("#{padding}#{el.text}#{padding}", el.document))
    end
  end

  flags = Nokogiri::XML::Node::SaveOptions
  html = node.serialize(:save_with => flags::NO_DECLARATION | flags::NO_EMPTY_TAGS | flags::AS_XHTML)

  html.gsub(/[\r\n\f]+/, "\n")
end
|
444
|
+
|
445
|
+
# Removes descendants of +node+ that look like boilerplate rather than content.
#
# node       - root node to search.
# candidates - Hash of scored nodes; a node's :content_score is used when it
#              is present, 0 otherwise.
# selector   - CSS selector picking the elements to examine.
#
# An element is removed when its class weight plus content score is
# negative. Otherwise, if its text has fewer than 10 commas, it is removed
# when #clean_conditionally_reason? reports a reason. Every removal is
# reported through #debug.
#
# Does nothing (returns nil) unless @clean_conditionally is set.
def clean_conditionally(node, candidates, selector)
  return unless @clean_conditionally

  node.css(selector).each do |el|
    weight = class_weight(el)
    content_score = candidates[el] ? candidates[el][:content_score] : 0
    name = el.name.downcase

    if weight + content_score < 0
      el.remove
      debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
    elsif el.text.count(",") < 10
      counts = %w[p img li a embed input].each_with_object({}) do |kind, tally|
        tally[kind] = el.css(kind).length
      end
      # Bias the <li> count so only very long lists can outnumber paragraphs.
      counts["li"] -= 100

      content_length = el.text.strip.length # text length excluding surrounding whitespace
      link_density = get_link_density(el)

      # The tag name is passed along so list containers can be exempted from
      # the "more <li>s than <p>s" rule.
      reason = clean_conditionally_reason?(counts, content_length, options, weight, link_density, name)
      if reason
        debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
        el.remove
      end
    end
  end
end

# Decides whether an element's statistics justify removing it.
#
# counts         - Hash of descendant counts keyed by tag name; the keys
#                  "p", "img", "li", "input" and "embed" are read.
# content_length - length of the element's stripped text.
# options        - Hash; :min_text_length overrides TEXT_LENGTH_THRESHOLD.
# weight         - class weight of the element.
# link_density   - link density of the element.
# name           - lower-cased tag name of the element (optional). "ul" and
#                  "ol" are exempt from the "more <li>s than <p>s" rule;
#                  when omitted, no exemption applies. Previously this
#                  method read an undefined +name+ and raised as soon as
#                  the <li> count exceeded the <p> count.
#
# Returns a String describing the first rule that matched, or nil when the
# element should be kept.
def clean_conditionally_reason?(counts, content_length, options, weight, link_density, name = nil)
  if counts["img"] > counts["p"]
    "too many images"
  elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
    "more <li>s than <p>s"
  elsif counts["input"] > (counts["p"] / 3).to_i
    "less than 3x <p>s than <input>s"
  elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
    "too short a content length without a single image"
  elsif weight < 25 && link_density > 0.2
    "too many links for its weight (#{weight})"
  elsif weight >= 25 && link_density > 0.5
    "too many links for its weight (#{weight})"
  elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
    "<embed>s with too short a content length, or too many <embed>s"
  end
end
|
490
|
+
|
491
|
+
end
|
492
|
+
end
|