rgabo-readability 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +34 -0
- data/Rakefile +72 -0
- data/VERSION +1 -0
- data/example.rb +14 -0
- data/lib/readability.rb +45 -0
- data/lib/readability/harmonizable.rb +52 -0
- data/lib/readability/js/readability.js +1079 -0
- data/lib/readability/readable.rb +51 -0
- data/readability.gems +30 -0
- data/rgabo-readability.gemspec +72 -0
- data/spec/files/change_title.js +1 -0
- data/spec/files/tomdoc-reasonable-ruby-documentation.html +123 -0
- data/spec/readability/harmonizable_spec.rb +36 -0
- data/spec/readability/readable_spec.rb +50 -0
- data/spec/readability_spec.rb +4 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- metadata +136 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Gabor Ratky
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
= readability
|
2
|
+
|
3
|
+
The readability gem makes it easy to run Arc90's Readability script in Ruby using Nokogiri and Harmony. Harmony uses Johnson to run env.js in Ruby.
|
4
|
+
|
5
|
+
Example:
|
6
|
+
|
7
|
+
require 'rubygems'
|
8
|
+
require 'readability'
|
9
|
+
require 'open-uri'
|
10
|
+
|
11
|
+
# load document with Nokogiri
|
12
|
+
doc = Nokogiri::HTML(open('http://ajaxian.com/archives/johnson-wrapping-javascript-in-a-loving-ruby-embrace-and-arax'))
|
13
|
+
|
14
|
+
# set Readability parameters
|
15
|
+
doc.read_style = Readability::Style::NEWSPAPER
|
16
|
+
doc.read_size = Readability::Size::MEDIUM
|
17
|
+
doc.read_margin = Readability::Margin::MEDIUM
|
18
|
+
|
19
|
+
# Print result after Readability has been run
|
20
|
+
puts doc.to_readable
|
21
|
+
|
22
|
+
== Note on Patches/Pull Requests
|
23
|
+
|
24
|
+
* Fork the project.
|
25
|
+
* Make your feature addition or bug fix.
|
26
|
+
* Add tests for it. This is important so I don't break it in a
|
27
|
+
future version unintentionally.
|
28
|
+
* Commit, do not mess with Rakefile, VERSION, or LICENSE.
|
29
|
+
(if you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull)
|
30
|
+
* Send me a pull request. Bonus points for topic branches.
|
31
|
+
|
32
|
+
== Copyright
|
33
|
+
|
34
|
+
Copyright (c) 2010 Gabor Ratky. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'jeweler'
|
6
|
+
Jeweler::Tasks.new do |gem|
|
7
|
+
gem.name = "rgabo-readability"
|
8
|
+
gem.summary = %Q{Run Arc90's Readability on Nokogiri documents}
|
9
|
+
gem.description = %Q{Extends Nokogiri::HTML::Document to run Arc90's Readability and produce easy to read HTML documents.}
|
10
|
+
gem.email = "rgabo@rgabostyle.com"
|
11
|
+
gem.homepage = "http://github.com/rgabo/readability"
|
12
|
+
gem.authors = ["Gabor Ratky"]
|
13
|
+
gem.add_development_dependency "rspec", ">= 1.3.0"
|
14
|
+
gem.add_runtime_dependency "harmony", "0.5.5"
|
15
|
+
gem.add_runtime_dependency "nokogiri", "~> 1.4"
|
16
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
17
|
+
end
|
18
|
+
Jeweler::GemcutterTasks.new
|
19
|
+
rescue LoadError
|
20
|
+
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
21
|
+
end
|
22
|
+
|
23
|
+
require 'spec/rake/spectask'
|
24
|
+
Spec::Rake::SpecTask.new(:spec) do |spec|
|
25
|
+
spec.libs << 'lib' << 'spec'
|
26
|
+
spec.spec_files = FileList['spec/**/*_spec.rb']
|
27
|
+
end
|
28
|
+
|
29
|
+
Spec::Rake::SpecTask.new(:rcov) do |spec|
|
30
|
+
spec.libs << 'lib' << 'spec'
|
31
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
32
|
+
spec.rcov = true
|
33
|
+
end
|
34
|
+
|
35
|
+
task :spec => :check_dependencies
|
36
|
+
|
37
|
+
begin
|
38
|
+
require 'reek/adapters/rake_task'
|
39
|
+
Reek::RakeTask.new do |t|
|
40
|
+
t.fail_on_error = true
|
41
|
+
t.verbose = false
|
42
|
+
t.source_files = 'lib/**/*.rb'
|
43
|
+
end
|
44
|
+
rescue LoadError
|
45
|
+
task :reek do
|
46
|
+
abort "Reek is not available. In order to run reek, you must: gem install reek"
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
begin
|
51
|
+
require 'roodi'
|
52
|
+
require 'roodi_task'
|
53
|
+
RoodiTask.new do |t|
|
54
|
+
t.verbose = false
|
55
|
+
end
|
56
|
+
rescue LoadError
|
57
|
+
task :roodi do
|
58
|
+
abort "Roodi is not available. In order to run roodi, you must: gem install roodi"
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
task :default => :spec
|
63
|
+
|
64
|
+
require 'rake/rdoctask'
|
65
|
+
Rake::RDocTask.new do |rdoc|
|
66
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
67
|
+
|
68
|
+
rdoc.rdoc_dir = 'rdoc'
|
69
|
+
rdoc.title = "readability #{version}"
|
70
|
+
rdoc.rdoc_files.include('README*')
|
71
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
72
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/example.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'readability'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
# load document with Nokogiri
|
6
|
+
doc = Nokogiri::HTML(open(ARGV.first))
|
7
|
+
|
8
|
+
# set Readability parameters
|
9
|
+
doc.read_style = Readability::Style::NEWSPAPER
|
10
|
+
doc.read_size = Readability::Size::MEDIUM
|
11
|
+
doc.read_margin = Readability::Margin::MEDIUM
|
12
|
+
|
13
|
+
# Print result after Readability has been run
|
14
|
+
puts doc.to_readable
|
data/lib/readability.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# ensure that lib is in the load path
|
2
|
+
$:.unshift(File.dirname(__FILE__)) unless $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'harmony'
|
7
|
+
|
8
|
+
require 'readability/harmonizable'
|
9
|
+
require 'readability/readable'
|
10
|
+
|
11
|
+
# Run the Arc90 Lab Experiment Readability on a Nokogiri document.
|
12
|
+
# TODO: Add example
|
13
|
+
#
|
14
|
+
module Readability
|
15
|
+
module Style
|
16
|
+
NEWSPAPER = "style-newspaper"
|
17
|
+
NOVEL = "style-novel"
|
18
|
+
EBOOK = "style-ebook"
|
19
|
+
TERMINAL = "style-terminal"
|
20
|
+
APERTURA = "style-apertura"
|
21
|
+
ATHELAS = "style-athelas"
|
22
|
+
end
|
23
|
+
|
24
|
+
module Size
|
25
|
+
XSMALL = "size-x-small"
|
26
|
+
SMALL = "size-small"
|
27
|
+
MEDIUM = "size-medium"
|
28
|
+
LARGE = "size-large"
|
29
|
+
XLARGE = "size-x-large"
|
30
|
+
end
|
31
|
+
|
32
|
+
module Margin
|
33
|
+
XNARROW = "margin-x-narrow"
|
34
|
+
NARROW = "margin-narrow"
|
35
|
+
MEDIUM = "margin-medium"
|
36
|
+
WIDE = "margin-wide"
|
37
|
+
XWIDE = "margin-x-wide"
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
class Nokogiri::HTML::Document
|
42
|
+
include Readability::Readable
|
43
|
+
end
|
44
|
+
|
45
|
+
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module Readability
|
2
|
+
module Harmonizable
|
3
|
+
def window
|
4
|
+
if block_given?
|
5
|
+
harmony_page do |page|
|
6
|
+
yield page.window
|
7
|
+
page.window
|
8
|
+
end
|
9
|
+
else
|
10
|
+
harmony_page.window
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
def parse string_or_io, url = nil, encoding = nil, options = Nokogiri::XML::ParseOptions::DEFAULT_HTML, &block
|
15
|
+
self.root = Nokogiri::HTML::Document.parse(string_or_io, url, encoding, options, &block).root
|
16
|
+
end
|
17
|
+
|
18
|
+
def execute_js(code)
|
19
|
+
result = nil
|
20
|
+
|
21
|
+
harmony_page do |page|
|
22
|
+
result = page.execute_js(code)
|
23
|
+
end
|
24
|
+
|
25
|
+
result
|
26
|
+
end
|
27
|
+
alias :x :execute_js
|
28
|
+
|
29
|
+
def load_js(*paths)
|
30
|
+
harmony_page do |page|
|
31
|
+
page.load(*paths)
|
32
|
+
end
|
33
|
+
|
34
|
+
self
|
35
|
+
end
|
36
|
+
|
37
|
+
def harmony_page
|
38
|
+
# load document into a page
|
39
|
+
page = Harmony::Page.new(self.to_html)
|
40
|
+
|
41
|
+
# yield the page and reparse if a block is given
|
42
|
+
if block_given?
|
43
|
+
yield page
|
44
|
+
|
45
|
+
# parse the page back into the document
|
46
|
+
parse(page.to_html)
|
47
|
+
end
|
48
|
+
|
49
|
+
page
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,1079 @@
|
|
1
|
+
var dbg = function(s) {
|
2
|
+
if(typeof console !== 'undefined') {
|
3
|
+
console.log("Readability: " + s);
|
4
|
+
}
|
5
|
+
};
|
6
|
+
|
7
|
+
/*
|
8
|
+
* Readability. An Arc90 Lab Experiment.
|
9
|
+
* Website: http://lab.arc90.com/experiments/readability
|
10
|
+
* Source: http://code.google.com/p/arc90labs-readability
|
11
|
+
*
|
12
|
+
* Copyright (c) 2009 Arc90 Inc
|
13
|
+
* Readability is licensed under the Apache License, Version 2.0.
|
14
|
+
**/
|
15
|
+
var readability = {
|
16
|
+
version: '1.5.0',
|
17
|
+
emailSrc: 'http://lab.arc90.com/experiments/readability/email.php',
|
18
|
+
iframeLoads: 0,
|
19
|
+
frameHack: false, /**
|
20
|
+
* The frame hack is to work around a Firefox bug where if you
|
21
|
+
* pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
|
22
|
+
* So we fake a scrollbar in the wrapping div.
|
23
|
+
**/
|
24
|
+
bodyCache: null, /* Cache the body HTML in case we need to re-use it later */
|
25
|
+
flags: 0x1 | 0x2, /* Start with both flags set. */
|
26
|
+
|
27
|
+
/* constants */
|
28
|
+
FLAG_STRIP_UNLIKELYS: 0x1,
|
29
|
+
FLAG_WEIGHT_CLASSES: 0x2,
|
30
|
+
|
31
|
+
/**
|
32
|
+
* All of the regular expressions in use within readability.
|
33
|
+
* Defined up here so we don't instantiate them repeatedly in loops.
|
34
|
+
**/
|
35
|
+
regexps: {
|
36
|
+
unlikelyCandidatesRe: /combx|comment|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor/i,
|
37
|
+
okMaybeItsACandidateRe: /and|article|body|column|main/i,
|
38
|
+
positiveRe: /article|body|content|entry|hentry|page|pagination|post|text/i,
|
39
|
+
negativeRe: /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget/i,
|
40
|
+
divToPElementsRe: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
41
|
+
replaceBrsRe: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
|
42
|
+
replaceFontsRe: /<(\/?)font[^>]*>/gi,
|
43
|
+
trimRe: /^\s+|\s+$/g,
|
44
|
+
normalizeRe: /\s{2,}/g,
|
45
|
+
killBreaksRe: /(<br\s*\/?>(\s| ?)*){1,}/g,
|
46
|
+
videoRe: /http:\/\/(www\.)?(youtube|vimeo)\.com/i
|
47
|
+
},
|
48
|
+
|
49
|
+
/**
|
50
|
+
* Runs readability.
|
51
|
+
*
|
52
|
+
* Workflow:
|
53
|
+
* 1. Prep the document by removing script tags, css, etc.
|
54
|
+
* 2. Build readability's DOM tree.
|
55
|
+
* 3. Grab the article content from the current dom tree.
|
56
|
+
* 4. Replace the current DOM tree with the new one.
|
57
|
+
* 5. Read peacefully.
|
58
|
+
*
|
59
|
+
* @return void
|
60
|
+
**/
|
61
|
+
init: function() {
|
62
|
+
document.body.style.display = "none";
|
63
|
+
if(document.body && !readability.bodyCache) {
|
64
|
+
readability.bodyCache = document.body.innerHTML; }
|
65
|
+
|
66
|
+
readability.prepDocument();
|
67
|
+
|
68
|
+
/* Build readability's DOM tree */
|
69
|
+
var overlay = document.createElement("DIV");
|
70
|
+
var innerDiv = document.createElement("DIV");
|
71
|
+
var articleTools = readability.getArticleTools();
|
72
|
+
var articleTitle = readability.getArticleTitle();
|
73
|
+
var articleContent = readability.grabArticle();
|
74
|
+
var articleFooter = readability.getArticleFooter();
|
75
|
+
|
76
|
+
/**
|
77
|
+
* If we attempted to strip unlikely candidates on the first run through, and we ended up with no content,
|
78
|
+
* that may mean we stripped out the actual content so we couldn't parse it. So re-run init while preserving
|
79
|
+
* unlikely candidates to have a better shot at getting our content out properly.
|
80
|
+
**/
|
81
|
+
if(readability.getInnerText(articleContent, false).length < 250)
|
82
|
+
{
|
83
|
+
if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
|
84
|
+
readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
|
85
|
+
document.body.innerHTML = readability.bodyCache;
|
86
|
+
return readability.init();
|
87
|
+
}
|
88
|
+
else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
|
89
|
+
readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
|
90
|
+
document.body.innerHTML = readability.bodyCache;
|
91
|
+
return readability.init();
|
92
|
+
}
|
93
|
+
else {
|
94
|
+
articleContent.innerHTML = "<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p><p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>";
|
95
|
+
}
|
96
|
+
}
|
97
|
+
|
98
|
+
overlay.id = "readOverlay";
|
99
|
+
innerDiv.id = "readInner";
|
100
|
+
|
101
|
+
/* Apply user-selected styling */
|
102
|
+
document.body.className = readStyle;
|
103
|
+
if (readStyle == "style-athelas" || readStyle == "style-apertura"){
|
104
|
+
overlay.className = readStyle + " rdbTypekit";
|
105
|
+
}
|
106
|
+
else {
|
107
|
+
overlay.className = readStyle;
|
108
|
+
}
|
109
|
+
innerDiv.className = readMargin + " " + readSize;
|
110
|
+
|
111
|
+
/* Glue the structure of our document together. */
|
112
|
+
// articleContent.appendChild( articleFooter );
|
113
|
+
innerDiv.appendChild( articleTitle );
|
114
|
+
innerDiv.appendChild( articleContent );
|
115
|
+
innerDiv.appendChild( articleFooter );
|
116
|
+
overlay.appendChild( articleTools );
|
117
|
+
overlay.appendChild( innerDiv );
|
118
|
+
|
119
|
+
/* Clear the old HTML, insert the new content. */
|
120
|
+
document.body.innerHTML = "";
|
121
|
+
document.body.insertBefore(overlay, document.body.firstChild);
|
122
|
+
|
123
|
+
if(readability.frameHack)
|
124
|
+
{
|
125
|
+
var readOverlay = document.getElementById('readOverlay');
|
126
|
+
readOverlay.style.height = '100%';
|
127
|
+
readOverlay.style.overflow = 'auto';
|
128
|
+
}
|
129
|
+
|
130
|
+
/**
|
131
|
+
* If someone tries to use Readability on a site's root page, give them a warning about usage.
|
132
|
+
**/
|
133
|
+
if((window.location.protocol + "//" + window.location.host + "/") == window.location.href)
|
134
|
+
{
|
135
|
+
articleContent.style.display = "none";
|
136
|
+
var rootWarning = document.createElement('p');
|
137
|
+
rootWarning.id = "readability-warning";
|
138
|
+
rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +
|
139
|
+
"If you'd like to try rendering this page anyways, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";
|
140
|
+
|
141
|
+
innerDiv.insertBefore( rootWarning, articleContent );
|
142
|
+
}
|
143
|
+
document.body.style.display = "block";
|
144
|
+
|
145
|
+
window.scrollTo(0, 0);
|
146
|
+
|
147
|
+
/* If we're using the Typekit library, select the font */
|
148
|
+
if (readStyle == "style-athelas" || readStyle == "style-apertura") {
|
149
|
+
readability.useRdbTypekit();
|
150
|
+
}
|
151
|
+
},
|
152
|
+
|
153
|
+
/**
|
154
|
+
* Get the article tools Element that has buttons like reload, print, email.
|
155
|
+
*
|
156
|
+
* @return Element
|
157
|
+
**/
|
158
|
+
getArticleTools: function () {
|
159
|
+
var articleTools = document.createElement("DIV");
|
160
|
+
|
161
|
+
articleTools.id = "readTools";
|
162
|
+
articleTools.innerHTML =
|
163
|
+
"<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
|
164
|
+
"<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
|
165
|
+
"<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";
|
166
|
+
|
167
|
+
return articleTools;
|
168
|
+
},
|
169
|
+
|
170
|
+
/**
|
171
|
+
* Get the article title as an H1.
|
172
|
+
*
|
173
|
+
* @return Element
|
174
|
+
**/
|
175
|
+
getArticleTitle: function () {
|
176
|
+
var curTitle = "",
|
177
|
+
origTitle = "";
|
178
|
+
|
179
|
+
try {
|
180
|
+
curTitle = origTitle = document.title;
|
181
|
+
|
182
|
+
if(typeof curTitle != "string") { /* If they had an element with id "title" in their HTML */
|
183
|
+
curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
|
184
|
+
}
|
185
|
+
}
|
186
|
+
catch(e) {}
|
187
|
+
|
188
|
+
if(curTitle.match(/ [\|\-] /))
|
189
|
+
{
|
190
|
+
curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
|
191
|
+
|
192
|
+
if(curTitle.split(' ').length < 3) {
|
193
|
+
curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
|
194
|
+
}
|
195
|
+
}
|
196
|
+
else if(curTitle.indexOf(': ') !== -1)
|
197
|
+
{
|
198
|
+
curTitle = origTitle.replace(/.*:(.*)/gi, '$1');
|
199
|
+
|
200
|
+
if(curTitle.split(' ').length < 3) {
|
201
|
+
curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
|
202
|
+
}
|
203
|
+
}
|
204
|
+
else if(curTitle.length > 150 || curTitle.length < 15)
|
205
|
+
{
|
206
|
+
var hOnes = document.getElementsByTagName('h1');
|
207
|
+
if(hOnes.length == 1)
|
208
|
+
{
|
209
|
+
curTitle = readability.getInnerText(hOnes[0]);
|
210
|
+
}
|
211
|
+
}
|
212
|
+
|
213
|
+
curTitle = curTitle.replace( readability.regexps.trimRe, "" );
|
214
|
+
|
215
|
+
if(curTitle.split(' ').length <= 4) {
|
216
|
+
curTitle = origTitle;
|
217
|
+
}
|
218
|
+
|
219
|
+
var articleTitle = document.createElement("H1");
|
220
|
+
articleTitle.innerHTML = curTitle;
|
221
|
+
|
222
|
+
return articleTitle;
|
223
|
+
},
|
224
|
+
|
225
|
+
/**
|
226
|
+
* Get the footer with the readability mark etc.
|
227
|
+
*
|
228
|
+
* @return Element
|
229
|
+
**/
|
230
|
+
getArticleFooter: function () {
|
231
|
+
var articleFooter = document.createElement("DIV");
|
232
|
+
|
233
|
+
/**
|
234
|
+
* For research purposes, generate an img src that contains the chosen readstyle etc,
|
235
|
+
* so we can generate aggregate stats and change styles based on them in the future
|
236
|
+
**/
|
237
|
+
// var statsQueryParams = "?readStyle=" + encodeURIComponent(readStyle) + "&readMargin=" + encodeURIComponent(readMargin) + "&readSize=" + encodeURIComponent(readSize);
|
238
|
+
/* TODO: attach this to an image */
|
239
|
+
|
240
|
+
var twitterLink = document.createElement('a');
|
241
|
+
twitterLink.setAttribute('href','http://lab.arc90.com/experiments/readability');
|
242
|
+
twitterLink.setAttribute('id','footer-twitterLink');
|
243
|
+
twitterLink.setAttribute('title','Follow Arc90 on Twitter');
|
244
|
+
twitterLink.innerHTML = "Follow us on Twitter »";
|
245
|
+
|
246
|
+
articleFooter.id = "readFooter";
|
247
|
+
articleFooter.innerHTML =
|
248
|
+
"<div id='rdb-footer-left'>" +
|
249
|
+
"<a href='http://lab.arc90.com/experiments/readability' id='readability-logo'>Readability — </a>" +
|
250
|
+
"<a href='http://www.arc90.com/' id='arc90-logo'>An Arc90 Laboratory Experiment</a>" +
|
251
|
+
"<span id='readability-url'> — http://lab.arc90.com/experiments/readability</span>" +
|
252
|
+
"<a href='http://www.twitter.com/arc90' class='footer-twitterLink'>Follow us on Twitter »</a>" +
|
253
|
+
"</div>" +
|
254
|
+
"<div id='rdb-footer-right'>" +
|
255
|
+
"<a href='http://www.twitter.com/arc90' class='footer-twitterLink'>Follow us on Twitter »</a>" +
|
256
|
+
"<span class='version'>Readability version " + readability.version + "</span>" +
|
257
|
+
"</div>";
|
258
|
+
|
259
|
+
// if (readStyle == ("style-athelas" || "style-apertura")) {
|
260
|
+
// console.log("Using Typekit Footer");
|
261
|
+
// getElementById("rdb-footer-logo").appendChild(twitterLink);
|
262
|
+
// }
|
263
|
+
// else {
|
264
|
+
// console.log("Using Normal Footer");
|
265
|
+
// articleFooter.getElementById("rdb-footer-right").appendChild(twitterLink);
|
266
|
+
// }
|
267
|
+
|
268
|
+
return articleFooter;
|
269
|
+
},
|
270
|
+
|
271
|
+
/**
|
272
|
+
* Prepare the HTML document for readability to scrape it.
|
273
|
+
* This includes things like stripping javascript, CSS, and handling terrible markup.
|
274
|
+
*
|
275
|
+
* @return void
|
276
|
+
**/
|
277
|
+
prepDocument: function () {
|
278
|
+
/**
|
279
|
+
* In some cases a body element can't be found (if the HTML is totally hosed for example)
|
280
|
+
* so we create a new body node and append it to the document.
|
281
|
+
*/
|
282
|
+
if(document.body === null)
|
283
|
+
{
|
284
|
+
var body = document.createElement("body");
|
285
|
+
try {
|
286
|
+
document.body = body;
|
287
|
+
}
|
288
|
+
catch(e) {
|
289
|
+
document.documentElement.appendChild(body);
|
290
|
+
dbg(e);
|
291
|
+
}
|
292
|
+
}
|
293
|
+
|
294
|
+
var frames = document.getElementsByTagName('frame');
|
295
|
+
if(frames.length > 0)
|
296
|
+
{
|
297
|
+
var bestFrame = null;
|
298
|
+
var bestFrameSize = 0;
|
299
|
+
for(var frameIndex = 0; frameIndex < frames.length; frameIndex++)
|
300
|
+
{
|
301
|
+
var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight;
|
302
|
+
var canAccessFrame = false;
|
303
|
+
try {
|
304
|
+
frames[frameIndex].contentWindow.document.body;
|
305
|
+
canAccessFrame = true;
|
306
|
+
}
|
307
|
+
catch(eFrames) {
|
308
|
+
dbg(eFrames);
|
309
|
+
}
|
310
|
+
|
311
|
+
if(canAccessFrame && frameSize > bestFrameSize)
|
312
|
+
{
|
313
|
+
bestFrame = frames[frameIndex];
|
314
|
+
bestFrameSize = frameSize;
|
315
|
+
}
|
316
|
+
}
|
317
|
+
|
318
|
+
if(bestFrame)
|
319
|
+
{
|
320
|
+
var newBody = document.createElement('body');
|
321
|
+
newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML;
|
322
|
+
newBody.style.overflow = 'scroll';
|
323
|
+
document.body = newBody;
|
324
|
+
|
325
|
+
var frameset = document.getElementsByTagName('frameset')[0];
|
326
|
+
if(frameset) {
|
327
|
+
frameset.parentNode.removeChild(frameset); }
|
328
|
+
|
329
|
+
readability.frameHack = true;
|
330
|
+
}
|
331
|
+
}
|
332
|
+
|
333
|
+
/* remove all scripts that are not readability or typekit */
|
334
|
+
var scripts = document.getElementsByTagName('script');
|
335
|
+
for(var i = scripts.length-1; i >= 0; i--)
|
336
|
+
{
|
337
|
+
if(scripts[i].src == null || typeof(scripts[i].src) == "undefined" || (scripts[i].src.indexOf('readability') == -1 && scripts[i].src.indexOf('typekit') == -1))
|
338
|
+
{
|
339
|
+
scripts[i].parentNode.removeChild(scripts[i]);
|
340
|
+
}
|
341
|
+
}
|
342
|
+
|
343
|
+
/* disable all linked stylesheets that are not readability's */
|
344
|
+
for (var k=0;k < document.styleSheets.length; k++) {
|
345
|
+
if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") == -1) {
|
346
|
+
document.styleSheets[k].disabled = true;
|
347
|
+
}
|
348
|
+
}
|
349
|
+
|
350
|
+
/* Blank out the contents of all style tags (not doing this on IE) - TODO: Why not? */
|
351
|
+
var styleTags = document.getElementsByTagName("style");
|
352
|
+
for (var st=0;st < styleTags.length; st++) {
|
353
|
+
if (navigator.appName != "Microsoft Internet Explorer") {
|
354
|
+
styleTags[st].textContent = ""; }
|
355
|
+
}
|
356
|
+
|
357
|
+
/* Turn all double br's into p's */
|
358
|
+
/* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
|
359
|
+
document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrsRe, '</p><p>').replace(readability.regexps.replaceFontsRe, '<$1span>');
|
360
|
+
},
|
361
|
+
|
362
|
+
useRdbTypekit: function () {
|
363
|
+
var rdbHead = document.getElementsByTagName('head')[0];
|
364
|
+
var rdbTKScript = document.createElement('script');
|
365
|
+
var rdbTKCode = null;
|
366
|
+
|
367
|
+
var rdbTKLink = document.createElement('a');
|
368
|
+
rdbTKLink.setAttribute('class','rdbTK-powered');
|
369
|
+
rdbTKLink.setAttribute('title','Fonts by Typekit');
|
370
|
+
rdbTKLink.innerHTML = "Fonts by <span class='rdbTK'>Typekit</span>";
|
371
|
+
|
372
|
+
if (readStyle == "style-athelas") {
|
373
|
+
rdbTKCode = "sxt6vzy";
|
374
|
+
dbg("Using Athelas Theme");
|
375
|
+
|
376
|
+
rdbTKLink.setAttribute('href','http://typekit.com/?utm_source=readability&utm_medium=affiliate&utm_campaign=athelas');
|
377
|
+
rdbTKLink.setAttribute('id','rdb-athelas');
|
378
|
+
document.getElementById("rdb-footer-right").appendChild(rdbTKLink);
|
379
|
+
}
|
380
|
+
if (readStyle == "style-apertura") {
|
381
|
+
rdbTKCode = "bae8ybu";
|
382
|
+
dbg("Using Inverse Theme");
|
383
|
+
|
384
|
+
rdbTKLink.setAttribute('href','http://typekit.com/?utm_source=readability&utm_medium=affiliate&utm_campaign=inverse');
|
385
|
+
rdbTKLink.setAttribute('id','rdb-inverse');
|
386
|
+
document.getElementById("rdb-footer-right").appendChild(rdbTKLink);
|
387
|
+
}
|
388
|
+
|
389
|
+
/**
|
390
|
+
* Setting new script tag attributes to pull Typekit's libraries
|
391
|
+
**/
|
392
|
+
rdbTKScript.setAttribute('type','text/javascript');
|
393
|
+
rdbTKScript.setAttribute('src',"http://use.typekit.com/" + rdbTKCode + ".js");
|
394
|
+
rdbTKScript.setAttribute('charset','UTF-8');
|
395
|
+
rdbHead.appendChild(rdbTKScript);
|
396
|
+
|
397
|
+
/**
|
398
|
+
* In the future, maybe try using the following experimental Callback function?:
|
399
|
+
* http://gist.github.com/192350
|
400
|
+
* &
|
401
|
+
* http://getsatisfaction.com/typekit/topics/support_a_pre_and_post_load_callback_function
|
402
|
+
**/
|
403
|
+
var typekitLoader = function() {
|
404
|
+
dbg("Looking for Typekit.");
|
405
|
+
if(typeof Typekit != "undefined") {
|
406
|
+
try {
|
407
|
+
dbg("Caught typekit");
|
408
|
+
Typekit.load();
|
409
|
+
clearInterval(window.typekitInterval);
|
410
|
+
} catch(e) {
|
411
|
+
dbg("Typekit error: " + e);
|
412
|
+
}
|
413
|
+
}
|
414
|
+
};
|
415
|
+
|
416
|
+
window.typekitInterval = window.setInterval(typekitLoader, 100);
|
417
|
+
},
|
418
|
+
|
419
|
+
/**
|
420
|
+
* Prepare the article node for display. Clean out any inline styles,
|
421
|
+
* iframes, forms, strip extraneous <p> tags, etc.
|
422
|
+
*
|
423
|
+
* @param Element
|
424
|
+
* @return void
|
425
|
+
**/
|
426
|
+
prepArticle: function (articleContent) {
|
427
|
+
readability.cleanStyles(articleContent);
|
428
|
+
readability.killBreaks(articleContent);
|
429
|
+
|
430
|
+
/* Clean out junk from the article content */
|
431
|
+
readability.clean(articleContent, "form");
|
432
|
+
readability.clean(articleContent, "object");
|
433
|
+
readability.clean(articleContent, "h1");
|
434
|
+
/**
|
435
|
+
* If there is only one h2, they are probably using it
|
436
|
+
* as a header and not a subheader, so remove it since we already have a header.
|
437
|
+
***/
|
438
|
+
if(articleContent.getElementsByTagName('h2').length == 1) {
|
439
|
+
readability.clean(articleContent, "h2"); }
|
440
|
+
readability.clean(articleContent, "iframe");
|
441
|
+
|
442
|
+
readability.cleanHeaders(articleContent);
|
443
|
+
|
444
|
+
/* Do these last as the previous stuff may have removed junk that will affect these */
|
445
|
+
readability.cleanConditionally(articleContent, "table");
|
446
|
+
readability.cleanConditionally(articleContent, "ul");
|
447
|
+
readability.cleanConditionally(articleContent, "div");
|
448
|
+
|
449
|
+
/* Remove extra paragraphs */
|
450
|
+
var articleParagraphs = articleContent.getElementsByTagName('p');
|
451
|
+
for(var i = articleParagraphs.length-1; i >= 0; i--)
|
452
|
+
{
|
453
|
+
var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
|
454
|
+
var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
|
455
|
+
var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
|
456
|
+
|
457
|
+
if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) == '')
|
458
|
+
{
|
459
|
+
articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
|
460
|
+
}
|
461
|
+
}
|
462
|
+
|
463
|
+
try {
|
464
|
+
articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');
|
465
|
+
}
|
466
|
+
catch (e) {
|
467
|
+
dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
|
468
|
+
}
|
469
|
+
},
|
470
|
+
|
471
|
+
/**
|
472
|
+
* Initialize a node with the readability object. Also checks the
|
473
|
+
* className/id for special names to add to its score.
|
474
|
+
*
|
475
|
+
* @param Element
|
476
|
+
* @return void
|
477
|
+
**/
|
478
|
+
initializeNode: function (node) {
|
479
|
+
node.readability = {"contentScore": 0};
|
480
|
+
|
481
|
+
switch(node.tagName) {
|
482
|
+
case 'DIV':
|
483
|
+
node.readability.contentScore += 5;
|
484
|
+
break;
|
485
|
+
|
486
|
+
case 'PRE':
|
487
|
+
case 'TD':
|
488
|
+
case 'BLOCKQUOTE':
|
489
|
+
node.readability.contentScore += 3;
|
490
|
+
break;
|
491
|
+
|
492
|
+
case 'ADDRESS':
|
493
|
+
case 'OL':
|
494
|
+
case 'UL':
|
495
|
+
case 'DL':
|
496
|
+
case 'DD':
|
497
|
+
case 'DT':
|
498
|
+
case 'LI':
|
499
|
+
case 'FORM':
|
500
|
+
node.readability.contentScore -= 3;
|
501
|
+
break;
|
502
|
+
|
503
|
+
case 'H1':
|
504
|
+
case 'H2':
|
505
|
+
case 'H3':
|
506
|
+
case 'H4':
|
507
|
+
case 'H5':
|
508
|
+
case 'H6':
|
509
|
+
case 'TH':
|
510
|
+
node.readability.contentScore -= 5;
|
511
|
+
break;
|
512
|
+
}
|
513
|
+
|
514
|
+
node.readability.contentScore += readability.getClassWeight(node);
|
515
|
+
},
|
516
|
+
|
517
|
+
/***
|
518
|
+
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
|
519
|
+
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
|
520
|
+
*
|
521
|
+
* @return Element
|
522
|
+
**/
|
523
|
+
grabArticle: function () {
|
524
|
+
var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS);
|
525
|
+
|
526
|
+
/**
|
527
|
+
* First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
|
528
|
+
* into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
|
529
|
+
*
|
530
|
+
* Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
|
531
|
+
* TODO: Shouldn't this be a reverse traversal?
|
532
|
+
**/
|
533
|
+
var node = null;
|
534
|
+
var nodesToScore = [];
|
535
|
+
for(var nodeIndex = 0; (node = document.getElementsByTagName('*')[nodeIndex]); nodeIndex++)
|
536
|
+
{
|
537
|
+
/* Remove unlikely candidates */
|
538
|
+
if (stripUnlikelyCandidates) {
|
539
|
+
var unlikelyMatchString = node.className + node.id;
|
540
|
+
if (unlikelyMatchString.search(readability.regexps.unlikelyCandidatesRe) !== -1 &&
|
541
|
+
unlikelyMatchString.search(readability.regexps.okMaybeItsACandidateRe) == -1 &&
|
542
|
+
node.tagName !== "BODY")
|
543
|
+
{
|
544
|
+
dbg("Removing unlikely candidate - " + unlikelyMatchString);
|
545
|
+
node.parentNode.removeChild(node);
|
546
|
+
nodeIndex--;
|
547
|
+
continue;
|
548
|
+
}
|
549
|
+
}
|
550
|
+
|
551
|
+
if (node.tagName === "P" || node.tagName === "TD") {
|
552
|
+
nodesToScore[nodesToScore.length] = node;
|
553
|
+
}
|
554
|
+
|
555
|
+
/* Turn all divs that don't have children block level elements into p's */
|
556
|
+
if (node.tagName === "DIV") {
|
557
|
+
if (node.innerHTML.search(readability.regexps.divToPElementsRe) === -1) {
|
558
|
+
dbg("Altering div to p");
|
559
|
+
var newNode = document.createElement('p');
|
560
|
+
try {
|
561
|
+
newNode.innerHTML = node.innerHTML;
|
562
|
+
node.parentNode.replaceChild(newNode, node);
|
563
|
+
nodeIndex--;
|
564
|
+
}
|
565
|
+
catch(e) {
|
566
|
+
dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
|
567
|
+
}
|
568
|
+
}
|
569
|
+
else
|
570
|
+
{
|
571
|
+
/* EXPERIMENTAL */
|
572
|
+
for(var i = 0, il = node.childNodes.length; i < il; i++) {
|
573
|
+
var childNode = node.childNodes[i];
|
574
|
+
if(childNode.nodeType == 3) { // Node.TEXT_NODE
|
575
|
+
dbg("replacing text node with a p tag with the same content.");
|
576
|
+
var p = document.createElement('p');
|
577
|
+
p.innerHTML = childNode.nodeValue;
|
578
|
+
p.style.display = 'inline';
|
579
|
+
p.className = 'readability-styled';
|
580
|
+
childNode.parentNode.replaceChild(p, childNode);
|
581
|
+
}
|
582
|
+
}
|
583
|
+
}
|
584
|
+
}
|
585
|
+
}
|
586
|
+
|
587
|
+
/**
|
588
|
+
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
|
589
|
+
* Then add their score to their parent node.
|
590
|
+
*
|
591
|
+
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
|
592
|
+
**/
|
593
|
+
var candidates = [];
|
594
|
+
for (var pt=0; pt < nodesToScore.length; pt++) {
|
595
|
+
var parentNode = nodesToScore[pt].parentNode;
|
596
|
+
var grandParentNode = parentNode.parentNode;
|
597
|
+
var innerText = readability.getInnerText(nodesToScore[pt]);
|
598
|
+
|
599
|
+
/* If this paragraph is less than 25 characters, don't even count it. */
|
600
|
+
if(innerText.length < 25) {
|
601
|
+
continue; }
|
602
|
+
|
603
|
+
/* Initialize readability data for the parent. */
|
604
|
+
if(typeof parentNode.readability == 'undefined')
|
605
|
+
{
|
606
|
+
readability.initializeNode(parentNode);
|
607
|
+
candidates.push(parentNode);
|
608
|
+
}
|
609
|
+
|
610
|
+
/* Initialize readability data for the grandparent. */
|
611
|
+
if(typeof grandParentNode.readability == 'undefined')
|
612
|
+
{
|
613
|
+
readability.initializeNode(grandParentNode);
|
614
|
+
candidates.push(grandParentNode);
|
615
|
+
}
|
616
|
+
|
617
|
+
var contentScore = 0;
|
618
|
+
|
619
|
+
/* Add a point for the paragraph itself as a base. */
|
620
|
+
contentScore++;
|
621
|
+
|
622
|
+
/* Add points for any commas within this paragraph */
|
623
|
+
contentScore += innerText.split(',').length;
|
624
|
+
|
625
|
+
/* For every 100 characters in this paragraph, add another point. Up to 3 points. */
|
626
|
+
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
|
627
|
+
|
628
|
+
/* Add the score to the parent. The grandparent gets half. */
|
629
|
+
parentNode.readability.contentScore += contentScore;
|
630
|
+
grandParentNode.readability.contentScore += contentScore/2;
|
631
|
+
}
|
632
|
+
|
633
|
+
/**
|
634
|
+
* After we've calculated scores, loop through all of the possible candidate nodes we found
|
635
|
+
* and find the one with the highest score.
|
636
|
+
**/
|
637
|
+
var topCandidate = null;
|
638
|
+
for(var c=0, cl=candidates.length; c < cl; c++)
|
639
|
+
{
|
640
|
+
/**
|
641
|
+
* Scale the final candidates score based on link density. Good content should have a
|
642
|
+
* relatively small link density (5% or less) and be mostly unaffected by this operation.
|
643
|
+
**/
|
644
|
+
candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));
|
645
|
+
|
646
|
+
dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);
|
647
|
+
|
648
|
+
if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
|
649
|
+
topCandidate = candidates[c]; }
|
650
|
+
}
|
651
|
+
|
652
|
+
/**
|
653
|
+
* If we still have no top candidate, just use the body as a last resort.
|
654
|
+
* We also have to copy the body node so it is something we can modify.
|
655
|
+
**/
|
656
|
+
if (topCandidate === null || topCandidate.tagName == "BODY")
|
657
|
+
{
|
658
|
+
topCandidate = document.createElement("DIV");
|
659
|
+
topCandidate.innerHTML = document.body.innerHTML;
|
660
|
+
document.body.innerHTML = "";
|
661
|
+
document.body.appendChild(topCandidate);
|
662
|
+
readability.initializeNode(topCandidate);
|
663
|
+
}
|
664
|
+
|
665
|
+
|
666
|
+
/**
|
667
|
+
* Now that we have the top candidate, look through its siblings for content that might also be related.
|
668
|
+
* Things like preambles, content split by ads that we removed, etc.
|
669
|
+
**/
|
670
|
+
var articleContent = document.createElement("DIV");
|
671
|
+
articleContent.id = "readability-content";
|
672
|
+
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
|
673
|
+
var siblingNodes = topCandidate.parentNode.childNodes;
|
674
|
+
for(var s=0, sl=siblingNodes.length; s < sl; s++)
|
675
|
+
{
|
676
|
+
var siblingNode = siblingNodes[s];
|
677
|
+
var append = false;
|
678
|
+
|
679
|
+
dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability != 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
|
680
|
+
dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
|
681
|
+
|
682
|
+
if(siblingNode === topCandidate)
|
683
|
+
{
|
684
|
+
append = true;
|
685
|
+
}
|
686
|
+
|
687
|
+
var contentBonus = 0;
|
688
|
+
/* Give a small bonus if sibling nodes and top candidates have the example same classname */
|
689
|
+
if(siblingNode.className == topCandidate.className && topCandidate.className != "") {
|
690
|
+
contentBonus += 10;
|
691
|
+
}
|
692
|
+
|
693
|
+
if(typeof siblingNode.readability != 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
|
694
|
+
{
|
695
|
+
append = true;
|
696
|
+
}
|
697
|
+
|
698
|
+
if(siblingNode.nodeName == "P") {
|
699
|
+
var linkDensity = readability.getLinkDensity(siblingNode);
|
700
|
+
var nodeContent = readability.getInnerText(siblingNode);
|
701
|
+
var nodeLength = nodeContent.length;
|
702
|
+
|
703
|
+
if(nodeLength > 80 && linkDensity < 0.25)
|
704
|
+
{
|
705
|
+
append = true;
|
706
|
+
}
|
707
|
+
else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
|
708
|
+
{
|
709
|
+
append = true;
|
710
|
+
}
|
711
|
+
}
|
712
|
+
|
713
|
+
if(append)
|
714
|
+
{
|
715
|
+
dbg("Appending node: " + siblingNode);
|
716
|
+
|
717
|
+
var nodeToAppend = null;
|
718
|
+
if(siblingNode.nodeName != "DIV" && siblingNode.nodeName != "P") {
|
719
|
+
/* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
|
720
|
+
|
721
|
+
dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
|
722
|
+
nodeToAppend = document.createElement('div');
|
723
|
+
try {
|
724
|
+
nodeToAppend.id = siblingNode.id;
|
725
|
+
nodeToAppend.innerHTML = siblingNode.innerHTML;
|
726
|
+
}
|
727
|
+
catch(e)
|
728
|
+
{
|
729
|
+
dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
|
730
|
+
nodeToAppend = siblingNode;
|
731
|
+
s--;
|
732
|
+
sl--;
|
733
|
+
}
|
734
|
+
} else {
|
735
|
+
nodeToAppend = siblingNode;
|
736
|
+
s--;
|
737
|
+
sl--;
|
738
|
+
}
|
739
|
+
|
740
|
+
/* To ensure a node does not interfere with readability styles, remove its classnames */
|
741
|
+
nodeToAppend.className = "";
|
742
|
+
|
743
|
+
/* Append sibling and subtract from our list because it removes the node when you append to another node */
|
744
|
+
articleContent.appendChild(nodeToAppend);
|
745
|
+
}
|
746
|
+
}
|
747
|
+
|
748
|
+
/**
|
749
|
+
* So we have all of the content that we need. Now we clean it up for presentation.
|
750
|
+
**/
|
751
|
+
readability.prepArticle(articleContent);
|
752
|
+
|
753
|
+
return articleContent;
|
754
|
+
},
|
755
|
+
|
756
|
+
/**
|
757
|
+
* Get the inner text of a node in a cross-browser compatible way.
|
758
|
+
* This also strips out any excess whitespace to be found.
|
759
|
+
*
|
760
|
+
* @param Element
|
761
|
+
* @return string
|
762
|
+
**/
|
763
|
+
getInnerText: function (e, normalizeSpaces) {
|
764
|
+
var textContent = "";
|
765
|
+
|
766
|
+
normalizeSpaces = (typeof normalizeSpaces == 'undefined') ? true : normalizeSpaces;
|
767
|
+
|
768
|
+
textContent = e.innerText.replace( readability.regexps.trimRe, "" );
|
769
|
+
|
770
|
+
if(normalizeSpaces) {
|
771
|
+
return textContent.replace( readability.regexps.normalizeRe, " "); }
|
772
|
+
else {
|
773
|
+
return textContent; }
|
774
|
+
},
|
775
|
+
|
776
|
+
/**
|
777
|
+
* Get the number of times a string s appears in the node e.
|
778
|
+
*
|
779
|
+
* @param Element
|
780
|
+
* @param string - what to split on. Default is ","
|
781
|
+
* @return number (integer)
|
782
|
+
**/
|
783
|
+
getCharCount: function (e,s) {
|
784
|
+
s = s || ",";
|
785
|
+
return readability.getInnerText(e).split(s).length-1;
|
786
|
+
},
|
787
|
+
|
788
|
+
/**
|
789
|
+
* Remove the style attribute on every e and under.
|
790
|
+
* TODO: Test if getElementsByTagName(*) is faster.
|
791
|
+
*
|
792
|
+
* @param Element
|
793
|
+
* @return void
|
794
|
+
**/
|
795
|
+
cleanStyles: function (e) {
|
796
|
+
e = e || document;
|
797
|
+
var cur = e.firstChild;
|
798
|
+
|
799
|
+
if(!e) {
|
800
|
+
return; }
|
801
|
+
|
802
|
+
// Remove any root styles, if we're able.
|
803
|
+
if(typeof e.removeAttribute == 'function' && e.className != 'readability-styled') {
|
804
|
+
e.removeAttribute('style'); }
|
805
|
+
|
806
|
+
// Go until there are no more child nodes
|
807
|
+
while ( cur !== null ) {
|
808
|
+
if ( cur.nodeType == 1 ) {
|
809
|
+
// Remove style attribute(s) :
|
810
|
+
if(cur.className != "readability-styled") {
|
811
|
+
cur.removeAttribute("style");
|
812
|
+
}
|
813
|
+
readability.cleanStyles( cur );
|
814
|
+
}
|
815
|
+
cur = cur.nextSibling;
|
816
|
+
}
|
817
|
+
},
|
818
|
+
|
819
|
+
/**
|
820
|
+
* Get the density of links as a percentage of the content
|
821
|
+
* This is the amount of text that is inside a link divided by the total text in the node.
|
822
|
+
*
|
823
|
+
* @param Element
|
824
|
+
* @return number (float)
|
825
|
+
**/
|
826
|
+
getLinkDensity: function (e) {
|
827
|
+
var links = e.getElementsByTagName("a");
|
828
|
+
var textLength = readability.getInnerText(e).length;
|
829
|
+
var linkLength = 0;
|
830
|
+
for(var i=0, il=links.length; i<il;i++)
|
831
|
+
{
|
832
|
+
linkLength += readability.getInnerText(links[i]).length;
|
833
|
+
}
|
834
|
+
|
835
|
+
return linkLength / textLength;
|
836
|
+
},
|
837
|
+
|
838
|
+
/**
|
839
|
+
* Get an element's class/id weight. Uses regular expressions to tell if this
|
840
|
+
* element looks good or bad.
|
841
|
+
*
|
842
|
+
* @param Element
|
843
|
+
* @return number (Integer)
|
844
|
+
**/
|
845
|
+
getClassWeight: function (e) {
|
846
|
+
if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
|
847
|
+
return 0;
|
848
|
+
}
|
849
|
+
|
850
|
+
var weight = 0;
|
851
|
+
|
852
|
+
/* Look for a special classname */
|
853
|
+
if (e.className != "")
|
854
|
+
{
|
855
|
+
if(e.className.search(readability.regexps.negativeRe) !== -1) {
|
856
|
+
weight -= 25; }
|
857
|
+
|
858
|
+
if(e.className.search(readability.regexps.positiveRe) !== -1) {
|
859
|
+
weight += 25; }
|
860
|
+
}
|
861
|
+
|
862
|
+
/* Look for a special ID */
|
863
|
+
if (typeof(e.id) == 'string' && e.id != "")
|
864
|
+
{
|
865
|
+
if(e.id.search(readability.regexps.negativeRe) !== -1) {
|
866
|
+
weight -= 25; }
|
867
|
+
|
868
|
+
if(e.id.search(readability.regexps.positiveRe) !== -1) {
|
869
|
+
weight += 25; }
|
870
|
+
}
|
871
|
+
|
872
|
+
return weight;
|
873
|
+
},
|
874
|
+
|
875
|
+
/**
|
876
|
+
* Remove extraneous break tags from a node.
|
877
|
+
*
|
878
|
+
* @param Element
|
879
|
+
* @return void
|
880
|
+
**/
|
881
|
+
killBreaks: function (e) {
|
882
|
+
try {
|
883
|
+
e.innerHTML = e.innerHTML.replace(readability.regexps.killBreaksRe,'<br />');
|
884
|
+
}
|
885
|
+
catch (eBreaks) {
|
886
|
+
dbg("KillBreaks failed - this is an IE bug. Ignoring.: " + eBreaks);
|
887
|
+
}
|
888
|
+
},
|
889
|
+
|
890
|
+
/**
|
891
|
+
* Clean a node of all elements of type "tag".
|
892
|
+
* (Unless it's a youtube/vimeo video. People love movies.)
|
893
|
+
*
|
894
|
+
* @param Element
|
895
|
+
* @param string tag to clean
|
896
|
+
* @return void
|
897
|
+
**/
|
898
|
+
clean: function (e, tag) {
|
899
|
+
var targetList = e.getElementsByTagName( tag );
|
900
|
+
var isEmbed = (tag == 'object' || tag == 'embed');
|
901
|
+
|
902
|
+
for (var y=targetList.length-1; y >= 0; y--) {
|
903
|
+
/* Allow youtube and vimeo videos through as people usually want to see those. */
|
904
|
+
if(isEmbed) {
|
905
|
+
var attributeValues = "";
|
906
|
+
for (var i=0, il=targetList[y].attributes.length; i < il; i++) {
|
907
|
+
attributeValues += targetList[y].attributes[i].value + '|';
|
908
|
+
}
|
909
|
+
|
910
|
+
/* First, check the elements attributes to see if any of them contain youtube or vimeo */
|
911
|
+
if (attributeValues.search(readability.regexps.videoRe) !== -1) {
|
912
|
+
continue;
|
913
|
+
}
|
914
|
+
|
915
|
+
/* Then check the elements inside this element for the same. */
|
916
|
+
if (targetList[y].innerHTML.search(readability.regexps.videoRe) !== -1) {
|
917
|
+
continue;
|
918
|
+
}
|
919
|
+
|
920
|
+
}
|
921
|
+
|
922
|
+
targetList[y].parentNode.removeChild(targetList[y]);
|
923
|
+
}
|
924
|
+
},
|
925
|
+
|
926
|
+
/**
|
927
|
+
* Clean an element of all tags of type "tag" if they look fishy.
|
928
|
+
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
|
929
|
+
*
|
930
|
+
* @return void
|
931
|
+
**/
|
932
|
+
cleanConditionally: function (e, tag) {
|
933
|
+
var tagsList = e.getElementsByTagName(tag);
|
934
|
+
var curTagsLength = tagsList.length;
|
935
|
+
|
936
|
+
/**
|
937
|
+
* Gather counts for other typical elements embedded within.
|
938
|
+
* Traverse backwards so we can remove nodes at the same time without effecting the traversal.
|
939
|
+
*
|
940
|
+
* TODO: Consider taking into account original contentScore here.
|
941
|
+
**/
|
942
|
+
for (var i=curTagsLength-1; i >= 0; i--) {
|
943
|
+
var weight = readability.getClassWeight(tagsList[i]);
|
944
|
+
var contentScore = (typeof tagsList[i].readability != 'undefined') ? tagsList[i].readability.contentScore : 0;
|
945
|
+
|
946
|
+
dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability != 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
|
947
|
+
|
948
|
+
if(weight+contentScore < 0)
|
949
|
+
{
|
950
|
+
tagsList[i].parentNode.removeChild(tagsList[i]);
|
951
|
+
}
|
952
|
+
else if ( readability.getCharCount(tagsList[i],',') < 10) {
|
953
|
+
/**
|
954
|
+
* If there are not very many commas, and the number of
|
955
|
+
* non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
|
956
|
+
**/
|
957
|
+
var p = tagsList[i].getElementsByTagName("p").length;
|
958
|
+
var img = tagsList[i].getElementsByTagName("img").length;
|
959
|
+
var li = tagsList[i].getElementsByTagName("li").length-100;
|
960
|
+
var input = tagsList[i].getElementsByTagName("input").length;
|
961
|
+
|
962
|
+
var embedCount = 0;
|
963
|
+
var embeds = tagsList[i].getElementsByTagName("embed");
|
964
|
+
for(var ei=0,il=embeds.length; ei < il; ei++) {
|
965
|
+
if (embeds[ei].src.search(readability.regexps.videoRe) == -1) {
|
966
|
+
embedCount++;
|
967
|
+
}
|
968
|
+
}
|
969
|
+
|
970
|
+
var linkDensity = readability.getLinkDensity(tagsList[i]);
|
971
|
+
var contentLength = readability.getInnerText(tagsList[i]).length;
|
972
|
+
var toRemove = false;
|
973
|
+
|
974
|
+
if ( img > p ) {
|
975
|
+
toRemove = true;
|
976
|
+
} else if(li > p && tag != "ul" && tag != "ol") {
|
977
|
+
toRemove = true;
|
978
|
+
} else if( input > Math.floor(p/3) ) {
|
979
|
+
toRemove = true;
|
980
|
+
} else if(contentLength < 25 && (img === 0 || img > 2) ) {
|
981
|
+
toRemove = true;
|
982
|
+
} else if(weight < 25 && linkDensity > 0.2) {
|
983
|
+
toRemove = true;
|
984
|
+
} else if(weight >= 25 && linkDensity > 0.5) {
|
985
|
+
toRemove = true;
|
986
|
+
} else if((embedCount == 1 && contentLength < 75) || embedCount > 1) {
|
987
|
+
toRemove = true;
|
988
|
+
}
|
989
|
+
|
990
|
+
if(toRemove) {
|
991
|
+
tagsList[i].parentNode.removeChild(tagsList[i]);
|
992
|
+
}
|
993
|
+
}
|
994
|
+
}
|
995
|
+
},
|
996
|
+
|
997
|
+
/**
|
998
|
+
* Clean out spurious headers from an Element. Checks things like classnames and link density.
|
999
|
+
*
|
1000
|
+
* @param Element
|
1001
|
+
* @return void
|
1002
|
+
**/
|
1003
|
+
cleanHeaders: function (e) {
|
1004
|
+
for (var headerIndex = 1; headerIndex < 7; headerIndex++) {
|
1005
|
+
var headers = e.getElementsByTagName('h' + headerIndex);
|
1006
|
+
for (var i=headers.length-1; i >=0; i--) {
|
1007
|
+
if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) {
|
1008
|
+
headers[i].parentNode.removeChild(headers[i]);
|
1009
|
+
}
|
1010
|
+
}
|
1011
|
+
}
|
1012
|
+
},
|
1013
|
+
|
1014
|
+
/**
|
1015
|
+
* Show the email popup.
|
1016
|
+
*
|
1017
|
+
* @return void
|
1018
|
+
**/
|
1019
|
+
emailBox: function () {
|
1020
|
+
var emailContainerExists = document.getElementById('email-container');
|
1021
|
+
if(null !== emailContainerExists)
|
1022
|
+
{
|
1023
|
+
return;
|
1024
|
+
}
|
1025
|
+
|
1026
|
+
var emailContainer = document.createElement('div');
|
1027
|
+
emailContainer.setAttribute('id', 'email-container');
|
1028
|
+
emailContainer.innerHTML = '<iframe src="'+readability.emailSrc + '?pageUrl='+escape(window.location)+'&pageTitle='+escape(document.title)+'" scrolling="no" onload="readability.removeFrame()" style="width:500px; height: 490px; border: 0;"></iframe>';
|
1029
|
+
|
1030
|
+
document.body.appendChild(emailContainer);
|
1031
|
+
},
|
1032
|
+
|
1033
|
+
/**
|
1034
|
+
* Close the email popup. This is a hacktackular way to check if we're in a "close loop".
|
1035
|
+
* Since we don't have crossdomain access to the frame, we can only know when it has
|
1036
|
+
* loaded again. If it's loaded over 3 times, we know to close the frame.
|
1037
|
+
*
|
1038
|
+
* @return void
|
1039
|
+
**/
|
1040
|
+
removeFrame: function () {
|
1041
|
+
readability.iframeLoads++;
|
1042
|
+
if (readability.iframeLoads > 3)
|
1043
|
+
{
|
1044
|
+
var emailContainer = document.getElementById('email-container');
|
1045
|
+
if (null !== emailContainer) {
|
1046
|
+
emailContainer.parentNode.removeChild(emailContainer);
|
1047
|
+
}
|
1048
|
+
|
1049
|
+
readability.iframeLoads = 0;
|
1050
|
+
}
|
1051
|
+
},
|
1052
|
+
|
1053
|
+
htmlspecialchars: function (s) {
|
1054
|
+
if (typeof(s) == "string") {
|
1055
|
+
s = s.replace(/&/g, "&");
|
1056
|
+
s = s.replace(/"/g, """);
|
1057
|
+
s = s.replace(/'/g, "'");
|
1058
|
+
s = s.replace(/</g, "<");
|
1059
|
+
s = s.replace(/>/g, ">");
|
1060
|
+
}
|
1061
|
+
|
1062
|
+
return s;
|
1063
|
+
},
|
1064
|
+
|
1065
|
+
flagIsActive: function(flag) {
|
1066
|
+
return (readability.flags & flag) > 0;
|
1067
|
+
},
|
1068
|
+
|
1069
|
+
addFlag: function(flag) {
|
1070
|
+
readability.flags = readability.flags | flag;
|
1071
|
+
},
|
1072
|
+
|
1073
|
+
removeFlag: function(flag) {
|
1074
|
+
readability.flags = readability.flags & ~flag;
|
1075
|
+
}
|
1076
|
+
|
1077
|
+
};
|
1078
|
+
|
1079
|
+
/* Call the object's init entry point as soon as this script is evaluated (init is defined earlier in the file). */
readability.init();
|