rgabo-readability 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Gabor Ratky
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,34 @@
1
+ = readability
2
+
3
+ The readability gem makes it easy to run Arc90's Readability script in Ruby using Nokogiri and Harmony. Harmony uses Johnson to run env.js in Ruby.
4
+
5
+ Example:
6
+
7
+ require 'rubygems'
8
+ require 'readability'
9
+ require 'open-uri'
10
+
11
+ # load document with Nokogiri
12
+ doc = Nokogiri::HTML(open('http://ajaxian.com/archives/johnson-wrapping-javascript-in-a-loving-ruby-embrace-and-arax'))
13
+
14
+ # set Readability parameters
15
+ doc.read_style = Readability::Style::NEWSPAPER
16
+ doc.read_size = Readability::Size::MEDIUM
17
+ doc.read_margin = Readability::Margin::MEDIUM
18
+
19
+ # Print result after Readability has been run
20
+ puts doc.to_readable
21
+
22
+ == Note on Patches/Pull Requests
23
+
24
+ * Fork the project.
25
+ * Make your feature addition or bug fix.
26
+ * Add tests for it. This is important so I don't break it in a
27
+ future version unintentionally.
28
+ * Commit, do not mess with Rakefile, VERSION, or LICENSE.
29
+ (If you want to have your own version, that is fine, but bump the version in a commit by itself so I can ignore it when I pull.)
30
+ * Send me a pull request. Bonus points for topic branches.
31
+
32
+ == Copyright
33
+
34
+ Copyright (c) 2010 Gabor Ratky. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,72 @@
# Build, test, lint and documentation tasks for the rgabo-readability gem.
require 'rubygems'
require 'rake'

# Gem packaging tasks. When Jeweler cannot be loaded the tasks are skipped
# and an install hint is printed instead of aborting the whole Rakefile.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "rgabo-readability"
    gem.summary = %Q{Run Arc90's Readability on Nokogiri documents}
    gem.description = %Q{Extends Nokogiri::HTML::Document to run Arc90's Readability and produce easy to read HTML documents.}
    gem.email = "rgabo@rgabostyle.com"
    gem.homepage = "http://github.com/rgabo/readability"
    gem.authors = ["Gabor Ratky"]
    gem.add_development_dependency "rspec", ">= 1.3.0"
    gem.add_runtime_dependency "harmony", "0.5.5"
    gem.add_runtime_dependency "nokogiri", "~> 1.4"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# rake spec - run every spec under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# rake rcov - run the same specs with rcov enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# NOTE(review): :check_dependencies is not defined in this file - presumably
# it is supplied by the Jeweler tasks above; confirm it exists when Jeweler
# is missing.
task :spec => :check_dependencies

# rake reek - falls back to a task that aborts with an install hint when the
# reek library cannot be loaded.
begin
  require 'reek/adapters/rake_task'
  Reek::RakeTask.new do |t|
    t.fail_on_error = true
    t.verbose = false
    t.source_files = 'lib/**/*.rb'
  end
rescue LoadError
  task :reek do
    abort "Reek is not available. In order to run reek, you must: gem install reek"
  end
end

# rake roodi - same fallback approach as reek.
begin
  require 'roodi'
  require 'roodi_task'
  RoodiTask.new do |t|
    t.verbose = false
  end
rescue LoadError
  task :roodi do
    abort "Roodi is not available. In order to run roodi, you must: gem install roodi"
  end
end

task :default => :spec

# rake rdoc - generate API documentation into rdoc/.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # strip the trailing newline of the VERSION file so it does not end up in the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "readability #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/example.rb ADDED
@@ -0,0 +1,14 @@
# Command-line example: load the page named by the first argument, run
# Readability over it and print the readable HTML.
#
# Usage: ruby example.rb URL_OR_FILE
require 'rubygems'
require 'readability'
require 'open-uri'

source = ARGV.first
abort "Usage: ruby #{File.basename(__FILE__)} URL_OR_FILE" if source.nil? || source.empty?

# Open URLs through open-uri's URI#open and everything else as a plain file.
# Kernel#open is avoided on purpose: it would run an argument that starts
# with "|" as a shell command.
io = source =~ %r{\A[a-z][a-z0-9+.\-]*://}i ? URI.parse(source).open : File.open(source)

# load document with Nokogiri
doc = Nokogiri::HTML(io)

# set Readability parameters
doc.read_style = Readability::Style::NEWSPAPER
doc.read_size = Readability::Size::MEDIUM
doc.read_margin = Readability::Margin::MEDIUM

# Print result after Readability has been run
puts doc.to_readable
@@ -0,0 +1,45 @@
# Put this file's directory on the load path, unless it is already listed
# either as written or in expanded (absolute) form.
readability_lib_dir = File.dirname(__FILE__)
already_listed = $:.include?(readability_lib_dir) || $:.include?(File.expand_path(readability_lib_dir))
$:.unshift(readability_lib_dir) unless already_listed
3
+
4
+ require 'rubygems'
5
+ require 'nokogiri'
6
+ require 'harmony'
7
+
8
+ require 'readability/harmonizable'
9
+ require 'readability/readable'
10
+
# Run the Arc90 Lab Experiment Readability on a Nokogiri document.
#
# The nested modules hold the class-name strings accepted for the style,
# size and margin settings. Example:
#
#   doc = Nokogiri::HTML(html)
#   doc.read_style  = Readability::Style::NEWSPAPER
#   doc.read_size   = Readability::Size::MEDIUM
#   doc.read_margin = Readability::Margin::MEDIUM
#   puts doc.to_readable
#
# All constants are frozen so they cannot be modified in place by accident.
module Readability
  # Visual themes.
  module Style
    NEWSPAPER = "style-newspaper".freeze
    NOVEL = "style-novel".freeze
    EBOOK = "style-ebook".freeze
    TERMINAL = "style-terminal".freeze
    APERTURA = "style-apertura".freeze
    ATHELAS = "style-athelas".freeze
  end

  # Font sizes, smallest to largest.
  module Size
    XSMALL = "size-x-small".freeze
    SMALL = "size-small".freeze
    MEDIUM = "size-medium".freeze
    LARGE = "size-large".freeze
    XLARGE = "size-x-large".freeze
  end

  # Page margins, narrowest to widest.
  module Margin
    XNARROW = "margin-x-narrow".freeze
    NARROW = "margin-narrow".freeze
    MEDIUM = "margin-medium".freeze
    WIDE = "margin-wide".freeze
    XWIDE = "margin-x-wide".freeze
  end
end
40
+
# Mix Readability::Readable into every Nokogiri HTML document.
Nokogiri::HTML::Document.send(:include, Readability::Readable)
44
+
45
+
@@ -0,0 +1,52 @@
module Readability
  # Mixin that runs JavaScript against the including document by loading its
  # HTML into a Harmony::Page. The including object must respond to +to_html+
  # and +root=+.
  module Harmonizable
    # Without a block, returns the window of a fresh Harmony page built from
    # this document. With a block, yields that window, re-parses the page into
    # this document afterwards and returns the page (the value of
    # #harmony_page).
    def window
      return harmony_page.window unless block_given?

      harmony_page do |page|
        yield page.window
        # NOTE(review): harmony_page ignores the value of its block, so this
        # expression does not influence what #window returns.
        page.window
      end
    end

    # Parses the input with Nokogiri::HTML::Document.parse and replaces this
    # document's root with the root of the parsed result.
    def parse(string_or_io, url = nil, encoding = nil, options = Nokogiri::XML::ParseOptions::DEFAULT_HTML, &block)
      parsed = Nokogiri::HTML::Document.parse(string_or_io, url, encoding, options, &block)
      self.root = parsed.root
    end

    # Evaluates +code+ on a Harmony page built from this document, re-parses
    # the page into this document and returns the evaluation result.
    def execute_js(code)
      outcome = nil
      harmony_page { |page| outcome = page.execute_js(code) }
      outcome
    end
    alias_method :x, :execute_js

    # Loads the given script paths into a Harmony page built from this
    # document, re-parses the page into this document and returns +self+.
    def load_js(*paths)
      harmony_page { |page| page.load(*paths) }
      self
    end

    # Builds a Harmony::Page from this document's HTML. When a block is
    # given, the page is yielded and its HTML is parsed back into this
    # document afterwards. Always returns the page.
    def harmony_page
      page = Harmony::Page.new(to_html)
      return page unless block_given?

      yield page
      parse(page.to_html)
      page
    end
  end
end
@@ -0,0 +1,1079 @@
/**
 * Write a debug message, prefixed with "Readability: ", to the console.
 * Does nothing when no console object is defined.
 *
 * @param s the message to log
 * @return void
 **/
var dbg = function(s) {
    if (typeof console === 'undefined') {
        return;
    }
    console.log("Readability: " + s);
};
6
+
7
+ /*
8
+ * Readability. An Arc90 Lab Experiment.
9
+ * Website: http://lab.arc90.com/experiments/readability
10
+ * Source: http://code.google.com/p/arc90labs-readability
11
+ *
12
+ * Copyright (c) 2009 Arc90 Inc
13
+ * Readability is licensed under the Apache License, Version 2.0.
14
+ **/
15
+ var readability = {
16
+ version: '1.5.0',
17
+ emailSrc: 'http://lab.arc90.com/experiments/readability/email.php',
18
+ iframeLoads: 0,
19
+ frameHack: false, /**
20
+ * The frame hack is to workaround a firefox bug where if you
21
+ * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
22
+ * So we fake a scrollbar in the wrapping div.
23
+ **/
24
+ bodyCache: null, /* Cache the body HTML in case we need to re-use it later */
25
+ flags: 0x1 | 0x2, /* Start with both flags set. */
26
+
27
+ /* constants */
28
+ FLAG_STRIP_UNLIKELYS: 0x1,
29
+ FLAG_WEIGHT_CLASSES: 0x2,
30
+
31
+ /**
32
+ * All of the regular expressions in use within readability.
33
+ * Defined up here so we don't instantiate them repeatedly in loops.
34
+ **/
35
+ regexps: {
36
+ unlikelyCandidatesRe: /combx|comment|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor/i,
37
+ okMaybeItsACandidateRe: /and|article|body|column|main/i,
38
+ positiveRe: /article|body|content|entry|hentry|page|pagination|post|text/i,
39
+ negativeRe: /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget/i,
40
+ divToPElementsRe: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
41
+ replaceBrsRe: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
42
+ replaceFontsRe: /<(\/?)font[^>]*>/gi,
43
+ trimRe: /^\s+|\s+$/g,
44
+ normalizeRe: /\s{2,}/g,
45
+ killBreaksRe: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
46
+ videoRe: /http:\/\/(www\.)?(youtube|vimeo)\.com/i
47
+ },
48
+
49
+ /**
50
+ * Runs readability.
51
+ *
52
+ * Workflow:
53
+ * 1. Prep the document by removing script tags, css, etc.
54
+ * 2. Build readability's DOM tree.
55
+ * 3. Grab the article content from the current dom tree.
56
+ * 4. Replace the current DOM tree with the new one.
57
+ * 5. Read peacefully.
58
+ *
59
+ * @return void
60
+ **/
61
+ init: function() {
62
+ document.body.style.display = "none";
63
+ if(document.body && !readability.bodyCache) {
64
+ readability.bodyCache = document.body.innerHTML; }
65
+
66
+ readability.prepDocument();
67
+
68
+ /* Build readability's DOM tree */
69
+ var overlay = document.createElement("DIV");
70
+ var innerDiv = document.createElement("DIV");
71
+ var articleTools = readability.getArticleTools();
72
+ var articleTitle = readability.getArticleTitle();
73
+ var articleContent = readability.grabArticle();
74
+ var articleFooter = readability.getArticleFooter();
75
+
76
+ /**
77
+ * If we attempted to strip unlikely candidates on the first run through, and we ended up with no content,
78
+ * that may mean we stripped out the actual content so we couldn't parse it. So re-run init while preserving
79
+ * unlikely candidates to have a better shot at getting our content out properly.
80
+ **/
81
+ if(readability.getInnerText(articleContent, false).length < 250)
82
+ {
83
+ if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
84
+ readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
85
+ document.body.innerHTML = readability.bodyCache;
86
+ return readability.init();
87
+ }
88
+ else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
89
+ readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
90
+ document.body.innerHTML = readability.bodyCache;
91
+ return readability.init();
92
+ }
93
+ else {
94
+ articleContent.innerHTML = "<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p><p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>";
95
+ }
96
+ }
97
+
98
+ overlay.id = "readOverlay";
99
+ innerDiv.id = "readInner";
100
+
101
+ /* Apply user-selected styling */
102
+ document.body.className = readStyle;
103
+ if (readStyle == "style-athelas" || readStyle == "style-apertura"){
104
+ overlay.className = readStyle + " rdbTypekit";
105
+ }
106
+ else {
107
+ overlay.className = readStyle;
108
+ }
109
+ innerDiv.className = readMargin + " " + readSize;
110
+
111
+ /* Glue the structure of our document together. */
112
+ // articleContent.appendChild( articleFooter );
113
+ innerDiv.appendChild( articleTitle );
114
+ innerDiv.appendChild( articleContent );
115
+ innerDiv.appendChild( articleFooter );
116
+ overlay.appendChild( articleTools );
117
+ overlay.appendChild( innerDiv );
118
+
119
+ /* Clear the old HTML, insert the new content. */
120
+ document.body.innerHTML = "";
121
+ document.body.insertBefore(overlay, document.body.firstChild);
122
+
123
+ if(readability.frameHack)
124
+ {
125
+ var readOverlay = document.getElementById('readOverlay');
126
+ readOverlay.style.height = '100%';
127
+ readOverlay.style.overflow = 'auto';
128
+ }
129
+
130
+ /**
131
+ * If someone tries to use Readability on a site's root page, give them a warning about usage.
132
+ **/
133
+ if((window.location.protocol + "//" + window.location.host + "/") == window.location.href)
134
+ {
135
+ articleContent.style.display = "none";
136
+ var rootWarning = document.createElement('p');
137
+ rootWarning.id = "readability-warning";
138
+ rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +
139
+ "If you'd like to try rendering this page anyways, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";
140
+
141
+ innerDiv.insertBefore( rootWarning, articleContent );
142
+ }
143
+ document.body.style.display = "block";
144
+
145
+ window.scrollTo(0, 0);
146
+
147
+ /* If we're using the Typekit library, select the font */
148
+ if (readStyle == "style-athelas" || readStyle == "style-apertura") {
149
+ readability.useRdbTypekit();
150
+ }
151
+ },
152
+
153
+ /**
154
+ * Get the article tools Element that has buttons like reload, print, email.
155
+ *
156
+ * @return void
157
+ **/
158
+ getArticleTools: function () {
159
+ var articleTools = document.createElement("DIV");
160
+
161
+ articleTools.id = "readTools";
162
+ articleTools.innerHTML =
163
+ "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
164
+ "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
165
+ "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";
166
+
167
+ return articleTools;
168
+ },
169
+
170
+ /**
171
+ * Get the article title as an H1.
172
+ *
173
+ * @return void
174
+ **/
175
+ getArticleTitle: function () {
176
+ var curTitle = "",
177
+ origTitle = "";
178
+
179
+ try {
180
+ curTitle = origTitle = document.title;
181
+
182
+ if(typeof curTitle != "string") { /* If they had an element with id "title" in their HTML */
183
+ curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
184
+ }
185
+ }
186
+ catch(e) {}
187
+
188
+ if(curTitle.match(/ [\|\-] /))
189
+ {
190
+ curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
191
+
192
+ if(curTitle.split(' ').length < 3) {
193
+ curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
194
+ }
195
+ }
196
+ else if(curTitle.indexOf(': ') !== -1)
197
+ {
198
+ curTitle = origTitle.replace(/.*:(.*)/gi, '$1');
199
+
200
+ if(curTitle.split(' ').length < 3) {
201
+ curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
202
+ }
203
+ }
204
+ else if(curTitle.length > 150 || curTitle.length < 15)
205
+ {
206
+ var hOnes = document.getElementsByTagName('h1');
207
+ if(hOnes.length == 1)
208
+ {
209
+ curTitle = readability.getInnerText(hOnes[0]);
210
+ }
211
+ }
212
+
213
+ curTitle = curTitle.replace( readability.regexps.trimRe, "" );
214
+
215
+ if(curTitle.split(' ').length <= 4) {
216
+ curTitle = origTitle;
217
+ }
218
+
219
+ var articleTitle = document.createElement("H1");
220
+ articleTitle.innerHTML = curTitle;
221
+
222
+ return articleTitle;
223
+ },
224
+
225
+ /**
226
+ * Get the footer with the readability mark etc.
227
+ *
228
+ * @return void
229
+ **/
230
+ getArticleFooter: function () {
231
+ var articleFooter = document.createElement("DIV");
232
+
233
+ /**
234
+ * For research purposes, generate an img src that contains the chosen readstyle etc,
235
+ * so we can generate aggregate stats and change styles based on them in the future
236
+ **/
237
+ // var statsQueryParams = "?readStyle=" + encodeURIComponent(readStyle) + "&readMargin=" + encodeURIComponent(readMargin) + "&readSize=" + encodeURIComponent(readSize);
238
+ /* TODO: attach this to an image */
239
+
240
+ var twitterLink = document.createElement('a');
241
+ twitterLink.setAttribute('href','http://lab.arc90.com/experiments/readability');
242
+ twitterLink.setAttribute('id','footer-twitterLink');
243
+ twitterLink.setAttribute('title','Follow Arc90 on Twitter');
244
+ twitterLink.innerHTML = "Follow us on Twitter &raquo;";
245
+
246
+ articleFooter.id = "readFooter";
247
+ articleFooter.innerHTML =
248
+ "<div id='rdb-footer-left'>" +
249
+ "<a href='http://lab.arc90.com/experiments/readability' id='readability-logo'>Readability &mdash; </a>" +
250
+ "<a href='http://www.arc90.com/' id='arc90-logo'>An Arc90 Laboratory Experiment</a>" +
251
+ "<span id='readability-url'> &mdash; http://lab.arc90.com/experiments/readability</span>" +
252
+ "<a href='http://www.twitter.com/arc90' class='footer-twitterLink'>Follow us on Twitter &raquo;</a>" +
253
+ "</div>" +
254
+ "<div id='rdb-footer-right'>" +
255
+ "<a href='http://www.twitter.com/arc90' class='footer-twitterLink'>Follow us on Twitter &raquo;</a>" +
256
+ "<span class='version'>Readability version " + readability.version + "</span>" +
257
+ "</div>";
258
+
259
+ // if (readStyle == ("style-athelas" || "style-apertura")) {
260
+ // console.log("Using Typekit Footer");
261
+ // getElementById("rdb-footer-logo").appendChild(twitterLink);
262
+ // }
263
+ // else {
264
+ // console.log("Using Normal Footer");
265
+ // articleFooter.getElementById("rdb-footer-right").appendChild(twitterLink);
266
+ // }
267
+
268
+ return articleFooter;
269
+ },
270
+
271
+ /**
272
+ * Prepare the HTML document for readability to scrape it.
273
+ * This includes things like stripping javascript, CSS, and handling terrible markup.
274
+ *
275
+ * @return void
276
+ **/
277
+ prepDocument: function () {
278
+ /**
279
+ * In some cases a body element can't be found (if the HTML is totally hosed for example)
280
+ * so we create a new body node and append it to the document.
281
+ */
282
+ if(document.body === null)
283
+ {
284
+ var body = document.createElement("body");
285
+ try {
286
+ document.body = body;
287
+ }
288
+ catch(e) {
289
+ document.documentElement.appendChild(body);
290
+ dbg(e);
291
+ }
292
+ }
293
+
294
+ var frames = document.getElementsByTagName('frame');
295
+ if(frames.length > 0)
296
+ {
297
+ var bestFrame = null;
298
+ var bestFrameSize = 0;
299
+ for(var frameIndex = 0; frameIndex < frames.length; frameIndex++)
300
+ {
301
+ var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight;
302
+ var canAccessFrame = false;
303
+ try {
304
+ frames[frameIndex].contentWindow.document.body;
305
+ canAccessFrame = true;
306
+ }
307
+ catch(eFrames) {
308
+ dbg(eFrames);
309
+ }
310
+
311
+ if(canAccessFrame && frameSize > bestFrameSize)
312
+ {
313
+ bestFrame = frames[frameIndex];
314
+ bestFrameSize = frameSize;
315
+ }
316
+ }
317
+
318
+ if(bestFrame)
319
+ {
320
+ var newBody = document.createElement('body');
321
+ newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML;
322
+ newBody.style.overflow = 'scroll';
323
+ document.body = newBody;
324
+
325
+ var frameset = document.getElementsByTagName('frameset')[0];
326
+ if(frameset) {
327
+ frameset.parentNode.removeChild(frameset); }
328
+
329
+ readability.frameHack = true;
330
+ }
331
+ }
332
+
333
+ /* remove all scripts that are not readability */
334
+ var scripts = document.getElementsByTagName('script');
335
+ for(var i = scripts.length-1; i >= 0; i--)
336
+ {
337
+ if(scripts[i].src == null || typeof(scripts[i].src) == "undefined" || (scripts[i].src.indexOf('readability') == -1 && scripts[i].src.indexOf('typekit') == -1))
338
+ {
339
+ scripts[i].parentNode.removeChild(scripts[i]);
340
+ }
341
+ }
342
+
343
+ /* remove all stylesheets */
344
+ for (var k=0;k < document.styleSheets.length; k++) {
345
+ if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") == -1) {
346
+ document.styleSheets[k].disabled = true;
347
+ }
348
+ }
349
+
350
+ /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */
351
+ var styleTags = document.getElementsByTagName("style");
352
+ for (var st=0;st < styleTags.length; st++) {
353
+ if (navigator.appName != "Microsoft Internet Explorer") {
354
+ styleTags[st].textContent = ""; }
355
+ }
356
+
357
+ /* Turn all double br's into p's */
358
+ /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
359
+ document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrsRe, '</p><p>').replace(readability.regexps.replaceFontsRe, '<$1span>');
360
+ },
361
+
362
+ useRdbTypekit: function () {
363
+ var rdbHead = document.getElementsByTagName('head')[0];
364
+ var rdbTKScript = document.createElement('script');
365
+ var rdbTKCode = null;
366
+
367
+ var rdbTKLink = document.createElement('a');
368
+ rdbTKLink.setAttribute('class','rdbTK-powered');
369
+ rdbTKLink.setAttribute('title','Fonts by Typekit');
370
+ rdbTKLink.innerHTML = "Fonts by <span class='rdbTK'>Typekit</span>";
371
+
372
+ if (readStyle == "style-athelas") {
373
+ rdbTKCode = "sxt6vzy";
374
+ dbg("Using Athelas Theme");
375
+
376
+ rdbTKLink.setAttribute('href','http://typekit.com/?utm_source=readability&utm_medium=affiliate&utm_campaign=athelas');
377
+ rdbTKLink.setAttribute('id','rdb-athelas');
378
+ document.getElementById("rdb-footer-right").appendChild(rdbTKLink);
379
+ }
380
+ if (readStyle == "style-apertura") {
381
+ rdbTKCode = "bae8ybu";
382
+ dbg("Using Inverse Theme");
383
+
384
+ rdbTKLink.setAttribute('href','http://typekit.com/?utm_source=readability&utm_medium=affiliate&utm_campaign=inverse');
385
+ rdbTKLink.setAttribute('id','rdb-inverse');
386
+ document.getElementById("rdb-footer-right").appendChild(rdbTKLink);
387
+ }
388
+
389
+ /**
390
+ * Setting new script tag attributes to pull Typekits libraries
391
+ **/
392
+ rdbTKScript.setAttribute('type','text/javascript');
393
+ rdbTKScript.setAttribute('src',"http://use.typekit.com/" + rdbTKCode + ".js");
394
+ rdbTKScript.setAttribute('charset','UTF-8');
395
+ rdbHead.appendChild(rdbTKScript);
396
+
397
+ /**
398
+ * In the future, maybe try using the following experimental Callback function?:
399
+ * http://gist.github.com/192350
400
+ * &
401
+ * http://getsatisfaction.com/typekit/topics/support_a_pre_and_post_load_callback_function
402
+ **/
403
+ var typekitLoader = function() {
404
+ dbg("Looking for Typekit.");
405
+ if(typeof Typekit != "undefined") {
406
+ try {
407
+ dbg("Caught typekit");
408
+ Typekit.load();
409
+ clearInterval(window.typekitInterval);
410
+ } catch(e) {
411
+ dbg("Typekit error: " + e);
412
+ }
413
+ }
414
+ };
415
+
416
+ window.typekitInterval = window.setInterval(typekitLoader, 100);
417
+ },
418
+
419
+ /**
420
+ * Prepare the article node for display. Clean out any inline styles,
421
+ * iframes, forms, strip extraneous <p> tags, etc.
422
+ *
423
+ * @param Element
424
+ * @return void
425
+ **/
426
+ prepArticle: function (articleContent) {
427
+ readability.cleanStyles(articleContent);
428
+ readability.killBreaks(articleContent);
429
+
430
+ /* Clean out junk from the article content */
431
+ readability.clean(articleContent, "form");
432
+ readability.clean(articleContent, "object");
433
+ readability.clean(articleContent, "h1");
434
+ /**
435
+ * If there is only one h2, they are probably using it
436
+ * as a header and not a subheader, so remove it since we already have a header.
437
+ ***/
438
+ if(articleContent.getElementsByTagName('h2').length == 1) {
439
+ readability.clean(articleContent, "h2"); }
440
+ readability.clean(articleContent, "iframe");
441
+
442
+ readability.cleanHeaders(articleContent);
443
+
444
+ /* Do these last as the previous stuff may have removed junk that will affect these */
445
+ readability.cleanConditionally(articleContent, "table");
446
+ readability.cleanConditionally(articleContent, "ul");
447
+ readability.cleanConditionally(articleContent, "div");
448
+
449
+ /* Remove extra paragraphs */
450
+ var articleParagraphs = articleContent.getElementsByTagName('p');
451
+ for(var i = articleParagraphs.length-1; i >= 0; i--)
452
+ {
453
+ var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
454
+ var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
455
+ var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
456
+
457
+ if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) == '')
458
+ {
459
+ articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
460
+ }
461
+ }
462
+
463
+ try {
464
+ articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');
465
+ }
466
+ catch (e) {
467
+ dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
468
+ }
469
+ },
470
+
471
+ /**
472
+ * Initialize a node with the readability object. Also checks the
473
+ * className/id for special names to add to its score.
474
+ *
475
+ * @param Element
476
+ * @return void
477
+ **/
478
+ initializeNode: function (node) {
479
+ node.readability = {"contentScore": 0};
480
+
481
+ switch(node.tagName) {
482
+ case 'DIV':
483
+ node.readability.contentScore += 5;
484
+ break;
485
+
486
+ case 'PRE':
487
+ case 'TD':
488
+ case 'BLOCKQUOTE':
489
+ node.readability.contentScore += 3;
490
+ break;
491
+
492
+ case 'ADDRESS':
493
+ case 'OL':
494
+ case 'UL':
495
+ case 'DL':
496
+ case 'DD':
497
+ case 'DT':
498
+ case 'LI':
499
+ case 'FORM':
500
+ node.readability.contentScore -= 3;
501
+ break;
502
+
503
+ case 'H1':
504
+ case 'H2':
505
+ case 'H3':
506
+ case 'H4':
507
+ case 'H5':
508
+ case 'H6':
509
+ case 'TH':
510
+ node.readability.contentScore -= 5;
511
+ break;
512
+ }
513
+
514
+ node.readability.contentScore += readability.getClassWeight(node);
515
+ },
516
+
517
+ /***
518
+ * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
519
+ * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
520
+ *
521
+ * @return Element
522
+ **/
523
+ grabArticle: function () {
524
+ var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS);
525
+
526
+ /**
527
+ * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
528
+ * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
529
+ *
530
+ * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
531
+ * TODO: Shouldn't this be a reverse traversal?
532
+ **/
533
+ var node = null;
534
+ var nodesToScore = [];
535
+ for(var nodeIndex = 0; (node = document.getElementsByTagName('*')[nodeIndex]); nodeIndex++)
536
+ {
537
+ /* Remove unlikely candidates */
538
+ if (stripUnlikelyCandidates) {
539
+ var unlikelyMatchString = node.className + node.id;
540
+ if (unlikelyMatchString.search(readability.regexps.unlikelyCandidatesRe) !== -1 &&
541
+ unlikelyMatchString.search(readability.regexps.okMaybeItsACandidateRe) == -1 &&
542
+ node.tagName !== "BODY")
543
+ {
544
+ dbg("Removing unlikely candidate - " + unlikelyMatchString);
545
+ node.parentNode.removeChild(node);
546
+ nodeIndex--;
547
+ continue;
548
+ }
549
+ }
550
+
551
+ if (node.tagName === "P" || node.tagName === "TD") {
552
+ nodesToScore[nodesToScore.length] = node;
553
+ }
554
+
555
+ /* Turn all divs that don't have children block level elements into p's */
556
+ if (node.tagName === "DIV") {
557
+ if (node.innerHTML.search(readability.regexps.divToPElementsRe) === -1) {
558
+ dbg("Altering div to p");
559
+ var newNode = document.createElement('p');
560
+ try {
561
+ newNode.innerHTML = node.innerHTML;
562
+ node.parentNode.replaceChild(newNode, node);
563
+ nodeIndex--;
564
+ }
565
+ catch(e) {
566
+ dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
567
+ }
568
+ }
569
+ else
570
+ {
571
+ /* EXPERIMENTAL */
572
+ for(var i = 0, il = node.childNodes.length; i < il; i++) {
573
+ var childNode = node.childNodes[i];
574
+ if(childNode.nodeType == 3) { // Node.TEXT_NODE
575
+ dbg("replacing text node with a p tag with the same content.");
576
+ var p = document.createElement('p');
577
+ p.innerHTML = childNode.nodeValue;
578
+ p.style.display = 'inline';
579
+ p.className = 'readability-styled';
580
+ childNode.parentNode.replaceChild(p, childNode);
581
+ }
582
+ }
583
+ }
584
+ }
585
+ }
586
+
587
+ /**
588
+ * Loop through all paragraphs, and assign a score to them based on how content-y they look.
589
+ * Then add their score to their parent node.
590
+ *
591
+ * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
592
+ **/
593
+ var candidates = [];
594
+ for (var pt=0; pt < nodesToScore.length; pt++) {
595
+ var parentNode = nodesToScore[pt].parentNode;
596
+ var grandParentNode = parentNode.parentNode;
597
+ var innerText = readability.getInnerText(nodesToScore[pt]);
598
+
599
+ /* If this paragraph is less than 25 characters, don't even count it. */
600
+ if(innerText.length < 25) {
601
+ continue; }
602
+
603
+ /* Initialize readability data for the parent. */
604
+ if(typeof parentNode.readability == 'undefined')
605
+ {
606
+ readability.initializeNode(parentNode);
607
+ candidates.push(parentNode);
608
+ }
609
+
610
+ /* Initialize readability data for the grandparent. */
611
+ if(typeof grandParentNode.readability == 'undefined')
612
+ {
613
+ readability.initializeNode(grandParentNode);
614
+ candidates.push(grandParentNode);
615
+ }
616
+
617
+ var contentScore = 0;
618
+
619
+ /* Add a point for the paragraph itself as a base. */
620
+ contentScore++;
621
+
622
+ /* Add points for any commas within this paragraph */
623
+ contentScore += innerText.split(',').length;
624
+
625
+ /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
626
+ contentScore += Math.min(Math.floor(innerText.length / 100), 3);
627
+
628
+ /* Add the score to the parent. The grandparent gets half. */
629
+ parentNode.readability.contentScore += contentScore;
630
+ grandParentNode.readability.contentScore += contentScore/2;
631
+ }
632
+
633
+ /**
634
+ * After we've calculated scores, loop through all of the possible candidate nodes we found
635
+ * and find the one with the highest score.
636
+ **/
637
+ var topCandidate = null;
638
+ for(var c=0, cl=candidates.length; c < cl; c++)
639
+ {
640
+ /**
641
+ * Scale the final candidates score based on link density. Good content should have a
642
+ * relatively small link density (5% or less) and be mostly unaffected by this operation.
643
+ **/
644
+ candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));
645
+
646
+ dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);
647
+
648
+ if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
649
+ topCandidate = candidates[c]; }
650
+ }
651
+
652
+ /**
653
+ * If we still have no top candidate, just use the body as a last resort.
654
+ * We also have to copy the body node so it is something we can modify.
655
+ **/
656
+ if (topCandidate === null || topCandidate.tagName == "BODY")
657
+ {
658
+ topCandidate = document.createElement("DIV");
659
+ topCandidate.innerHTML = document.body.innerHTML;
660
+ document.body.innerHTML = "";
661
+ document.body.appendChild(topCandidate);
662
+ readability.initializeNode(topCandidate);
663
+ }
664
+
665
+
666
+ /**
667
+ * Now that we have the top candidate, look through its siblings for content that might also be related.
668
+ * Things like preambles, content split by ads that we removed, etc.
669
+ **/
670
+ var articleContent = document.createElement("DIV");
671
+ articleContent.id = "readability-content";
672
+ var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
673
+ var siblingNodes = topCandidate.parentNode.childNodes;
674
+ for(var s=0, sl=siblingNodes.length; s < sl; s++)
675
+ {
676
+ var siblingNode = siblingNodes[s];
677
+ var append = false;
678
+
679
+ dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability != 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
680
+ dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
681
+
682
+ if(siblingNode === topCandidate)
683
+ {
684
+ append = true;
685
+ }
686
+
687
+ var contentBonus = 0;
688
+ /* Give a small bonus if sibling nodes and top candidates have the exact same classname */
689
+ if(siblingNode.className == topCandidate.className && topCandidate.className != "") {
690
+ contentBonus += 10;
691
+ }
692
+
693
+ if(typeof siblingNode.readability != 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
694
+ {
695
+ append = true;
696
+ }
697
+
698
+ if(siblingNode.nodeName == "P") {
699
+ var linkDensity = readability.getLinkDensity(siblingNode);
700
+ var nodeContent = readability.getInnerText(siblingNode);
701
+ var nodeLength = nodeContent.length;
702
+
703
+ if(nodeLength > 80 && linkDensity < 0.25)
704
+ {
705
+ append = true;
706
+ }
707
+ else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
708
+ {
709
+ append = true;
710
+ }
711
+ }
712
+
713
+ if(append)
714
+ {
715
+ dbg("Appending node: " + siblingNode);
716
+
717
+ var nodeToAppend = null;
718
+ if(siblingNode.nodeName != "DIV" && siblingNode.nodeName != "P") {
719
+ /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
720
+
721
+ dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
722
+ nodeToAppend = document.createElement('div');
723
+ try {
724
+ nodeToAppend.id = siblingNode.id;
725
+ nodeToAppend.innerHTML = siblingNode.innerHTML;
726
+ }
727
+ catch(e)
728
+ {
729
+ dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
730
+ nodeToAppend = siblingNode;
731
+ s--;
732
+ sl--;
733
+ }
734
+ } else {
735
+ nodeToAppend = siblingNode;
736
+ s--;
737
+ sl--;
738
+ }
739
+
740
+ /* To ensure a node does not interfere with readability styles, remove its classnames */
741
+ nodeToAppend.className = "";
742
+
743
+ /* Append sibling and subtract from our list because it removes the node when you append to another node */
744
+ articleContent.appendChild(nodeToAppend);
745
+ }
746
+ }
747
+
748
+ /**
749
+ * So we have all of the content that we need. Now we clean it up for presentation.
750
+ **/
751
+ readability.prepArticle(articleContent);
752
+
753
+ return articleContent;
754
+ },
755
+
756
+ /**
757
+ * Get the inner text of a node - cross browser compatibly.
758
+ * This also strips out any excess whitespace to be found.
759
+ *
760
+ * @param Element
761
+ * @return string
762
+ **/
763
+ getInnerText: function (e, normalizeSpaces) {
764
+ var textContent = "";
765
+
766
+ normalizeSpaces = (typeof normalizeSpaces == 'undefined') ? true : normalizeSpaces;
767
+
768
+ textContent = e.innerText.replace( readability.regexps.trimRe, "" );
769
+
770
+ if(normalizeSpaces) {
771
+ return textContent.replace( readability.regexps.normalizeRe, " "); }
772
+ else {
773
+ return textContent; }
774
+ },
775
+
776
+ /**
777
+ * Get the number of times a string s appears in the node e.
778
+ *
779
+ * @param Element
780
+ * @param string - what to split on. Default is ","
781
+ * @return number (integer)
782
+ **/
783
+ getCharCount: function (e,s) {
784
+ s = s || ",";
785
+ return readability.getInnerText(e).split(s).length-1;
786
+ },
787
+
788
+ /**
789
+ * Remove the style attribute on every e and under.
790
+ * TODO: Test if getElementsByTagName(*) is faster.
791
+ *
792
+ * @param Element
793
+ * @return void
794
+ **/
795
+ cleanStyles: function (e) {
796
+ e = e || document;
797
+ var cur = e.firstChild;
798
+
799
+ if(!e) {
800
+ return; }
801
+
802
+ // Remove any root styles, if we're able.
803
+ if(typeof e.removeAttribute == 'function' && e.className != 'readability-styled') {
804
+ e.removeAttribute('style'); }
805
+
806
+ // Go until there are no more child nodes
807
+ while ( cur !== null ) {
808
+ if ( cur.nodeType == 1 ) {
809
+ // Remove style attribute(s) :
810
+ if(cur.className != "readability-styled") {
811
+ cur.removeAttribute("style");
812
+ }
813
+ readability.cleanStyles( cur );
814
+ }
815
+ cur = cur.nextSibling;
816
+ }
817
+ },
818
+
819
+ /**
820
+ * Get the density of links as a percentage of the content
821
+ * This is the amount of text that is inside a link divided by the total text in the node.
822
+ *
823
+ * @param Element
824
+ * @return number (float)
825
+ **/
826
+ getLinkDensity: function (e) {
827
+ var links = e.getElementsByTagName("a");
828
+ var textLength = readability.getInnerText(e).length;
829
+ var linkLength = 0;
830
+ for(var i=0, il=links.length; i<il;i++)
831
+ {
832
+ linkLength += readability.getInnerText(links[i]).length;
833
+ }
834
+
835
+ return linkLength / textLength;
836
+ },
837
+
838
+ /**
839
+ * Get an element's class/id weight. Uses regular expressions to tell if this
840
+ * element looks good or bad.
841
+ *
842
+ * @param Element
843
+ * @return number (Integer)
844
+ **/
845
+ getClassWeight: function (e) {
846
+ if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
847
+ return 0;
848
+ }
849
+
850
+ var weight = 0;
851
+
852
+ /* Look for a special classname */
853
+ if (e.className != "")
854
+ {
855
+ if(e.className.search(readability.regexps.negativeRe) !== -1) {
856
+ weight -= 25; }
857
+
858
+ if(e.className.search(readability.regexps.positiveRe) !== -1) {
859
+ weight += 25; }
860
+ }
861
+
862
+ /* Look for a special ID */
863
+ if (typeof(e.id) == 'string' && e.id != "")
864
+ {
865
+ if(e.id.search(readability.regexps.negativeRe) !== -1) {
866
+ weight -= 25; }
867
+
868
+ if(e.id.search(readability.regexps.positiveRe) !== -1) {
869
+ weight += 25; }
870
+ }
871
+
872
+ return weight;
873
+ },
874
+
875
+ /**
876
+ * Remove extraneous break tags from a node.
877
+ *
878
+ * @param Element
879
+ * @return void
880
+ **/
881
+ killBreaks: function (e) {
882
+ try {
883
+ e.innerHTML = e.innerHTML.replace(readability.regexps.killBreaksRe,'<br />');
884
+ }
885
+ catch (eBreaks) {
886
+ dbg("KillBreaks failed - this is an IE bug. Ignoring.: " + eBreaks);
887
+ }
888
+ },
889
+
890
+ /**
891
+ * Clean a node of all elements of type "tag".
892
+ * (Unless it's a youtube/vimeo video. People love movies.)
893
+ *
894
+ * @param Element
895
+ * @param string tag to clean
896
+ * @return void
897
+ **/
898
+ clean: function (e, tag) {
899
+ var targetList = e.getElementsByTagName( tag );
900
+ var isEmbed = (tag == 'object' || tag == 'embed');
901
+
902
+ for (var y=targetList.length-1; y >= 0; y--) {
903
+ /* Allow youtube and vimeo videos through as people usually want to see those. */
904
+ if(isEmbed) {
905
+ var attributeValues = "";
906
+ for (var i=0, il=targetList[y].attributes.length; i < il; i++) {
907
+ attributeValues += targetList[y].attributes[i].value + '|';
908
+ }
909
+
910
+ /* First, check the elements attributes to see if any of them contain youtube or vimeo */
911
+ if (attributeValues.search(readability.regexps.videoRe) !== -1) {
912
+ continue;
913
+ }
914
+
915
+ /* Then check the elements inside this element for the same. */
916
+ if (targetList[y].innerHTML.search(readability.regexps.videoRe) !== -1) {
917
+ continue;
918
+ }
919
+
920
+ }
921
+
922
+ targetList[y].parentNode.removeChild(targetList[y]);
923
+ }
924
+ },
925
+
926
+ /**
927
+ * Clean an element of all tags of type "tag" if they look fishy.
928
+ * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
929
+ *
930
+ * @return void
931
+ **/
932
+ cleanConditionally: function (e, tag) {
933
+ var tagsList = e.getElementsByTagName(tag);
934
+ var curTagsLength = tagsList.length;
935
+
936
+ /**
937
+ * Gather counts for other typical elements embedded within.
938
+ * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
939
+ *
940
+ * TODO: Consider taking into account original contentScore here.
941
+ **/
942
+ for (var i=curTagsLength-1; i >= 0; i--) {
943
+ var weight = readability.getClassWeight(tagsList[i]);
944
+ var contentScore = (typeof tagsList[i].readability != 'undefined') ? tagsList[i].readability.contentScore : 0;
945
+
946
+ dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability != 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
947
+
948
+ if(weight+contentScore < 0)
949
+ {
950
+ tagsList[i].parentNode.removeChild(tagsList[i]);
951
+ }
952
+ else if ( readability.getCharCount(tagsList[i],',') < 10) {
953
+ /**
954
+ * If there are not very many commas, and the number of
955
+ * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
956
+ **/
957
+ var p = tagsList[i].getElementsByTagName("p").length;
958
+ var img = tagsList[i].getElementsByTagName("img").length;
959
+ var li = tagsList[i].getElementsByTagName("li").length-100;
960
+ var input = tagsList[i].getElementsByTagName("input").length;
961
+
962
+ var embedCount = 0;
963
+ var embeds = tagsList[i].getElementsByTagName("embed");
964
+ for(var ei=0,il=embeds.length; ei < il; ei++) {
965
+ if (embeds[ei].src.search(readability.regexps.videoRe) == -1) {
966
+ embedCount++;
967
+ }
968
+ }
969
+
970
+ var linkDensity = readability.getLinkDensity(tagsList[i]);
971
+ var contentLength = readability.getInnerText(tagsList[i]).length;
972
+ var toRemove = false;
973
+
974
+ if ( img > p ) {
975
+ toRemove = true;
976
+ } else if(li > p && tag != "ul" && tag != "ol") {
977
+ toRemove = true;
978
+ } else if( input > Math.floor(p/3) ) {
979
+ toRemove = true;
980
+ } else if(contentLength < 25 && (img === 0 || img > 2) ) {
981
+ toRemove = true;
982
+ } else if(weight < 25 && linkDensity > 0.2) {
983
+ toRemove = true;
984
+ } else if(weight >= 25 && linkDensity > 0.5) {
985
+ toRemove = true;
986
+ } else if((embedCount == 1 && contentLength < 75) || embedCount > 1) {
987
+ toRemove = true;
988
+ }
989
+
990
+ if(toRemove) {
991
+ tagsList[i].parentNode.removeChild(tagsList[i]);
992
+ }
993
+ }
994
+ }
995
+ },
996
+
997
+ /**
998
+ * Clean out spurious headers from an Element. Checks things like classnames and link density.
999
+ *
1000
+ * @param Element
1001
+ * @return void
1002
+ **/
1003
+ cleanHeaders: function (e) {
1004
+ for (var headerIndex = 1; headerIndex < 7; headerIndex++) {
1005
+ var headers = e.getElementsByTagName('h' + headerIndex);
1006
+ for (var i=headers.length-1; i >=0; i--) {
1007
+ if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) {
1008
+ headers[i].parentNode.removeChild(headers[i]);
1009
+ }
1010
+ }
1011
+ }
1012
+ },
1013
+
1014
+ /**
1015
+ * Show the email popup.
1016
+ *
1017
+ * @return void
1018
+ **/
1019
+ emailBox: function () {
1020
+ var emailContainerExists = document.getElementById('email-container');
1021
+ if(null !== emailContainerExists)
1022
+ {
1023
+ return;
1024
+ }
1025
+
1026
+ var emailContainer = document.createElement('div');
1027
+ emailContainer.setAttribute('id', 'email-container');
1028
+ emailContainer.innerHTML = '<iframe src="'+readability.emailSrc + '?pageUrl='+escape(window.location)+'&pageTitle='+escape(document.title)+'" scrolling="no" onload="readability.removeFrame()" style="width:500px; height: 490px; border: 0;"></iframe>';
1029
+
1030
+ document.body.appendChild(emailContainer);
1031
+ },
1032
+
1033
+ /**
1034
+ * Close the email popup. This is a hacktackular way to check if we're in a "close loop".
1035
+ * Since we don't have crossdomain access to the frame, we can only know when it has
1036
+ * loaded again. If it's loaded over 3 times, we know to close the frame.
1037
+ *
1038
+ * @return void
1039
+ **/
1040
+ removeFrame: function () {
1041
+ readability.iframeLoads++;
1042
+ if (readability.iframeLoads > 3)
1043
+ {
1044
+ var emailContainer = document.getElementById('email-container');
1045
+ if (null !== emailContainer) {
1046
+ emailContainer.parentNode.removeChild(emailContainer);
1047
+ }
1048
+
1049
+ readability.iframeLoads = 0;
1050
+ }
1051
+ },
1052
+
1053
+ htmlspecialchars: function (s) {
1054
+ if (typeof(s) == "string") {
1055
+ s = s.replace(/&/g, "&amp;");
1056
+ s = s.replace(/"/g, "&quot;");
1057
+ s = s.replace(/'/g, "&#039;");
1058
+ s = s.replace(/</g, "&lt;");
1059
+ s = s.replace(/>/g, "&gt;");
1060
+ }
1061
+
1062
+ return s;
1063
+ },
1064
+
1065
+ flagIsActive: function(flag) {
1066
+ return (readability.flags & flag) > 0;
1067
+ },
1068
+
1069
+ addFlag: function(flag) {
1070
+ readability.flags = readability.flags | flag;
1071
+ },
1072
+
1073
+ removeFlag: function(flag) {
1074
+ readability.flags = readability.flags & ~flag;
1075
+ }
1076
+
1077
+ };
1078
+
1079
+ readability.init();