rgabo-readability 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Gabor Ratky
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,34 @@
1
+ = readability
2
+
3
+ The readability gem makes it easy to run Arc90's Readability script in Ruby using Nokogiri and Harmony. Harmony uses Johnson to run env.js in Ruby.
4
+
5
+ Example:
6
+
7
+ require 'rubygems'
8
+ require 'readability'
9
+ require 'open-uri'
10
+
11
+ # load document with Nokogiri
12
+ doc = Nokogiri::HTML(open('http://ajaxian.com/archives/johnson-wrapping-javascript-in-a-loving-ruby-embrace-and-arax'))
13
+
14
+ # set Readability parameters
15
+ doc.read_style = Readability::Style::NEWSPAPER
16
+ doc.read_size = Readability::Size::MEDIUM
17
+ doc.read_margin = Readability::Margin::MEDIUM
18
+
19
+ # Print result after Readability has been run
20
+ puts doc.to_readable
21
+
22
+ == Note on Patches/Pull Requests
23
+
24
+ * Fork the project.
25
+ * Make your feature addition or bug fix.
26
+ * Add tests for it. This is important so I don't break it in a
27
+ future version unintentionally.
28
+ * Commit, do not mess with Rakefile, VERSION, or LICENSE.
29
+ (if you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull)
30
+ * Send me a pull request. Bonus points for topic branches.
31
+
32
+ == Copyright
33
+
34
+ Copyright (c) 2010 Gabor Ratky. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,72 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "rgabo-readability"
8
+ gem.summary = %Q{Run Arc90's Readability on Nokogiri documents}
9
+ gem.description = %Q{Extends Nokogiri::HTML::Document to run Arc90's Readability and procude easy to read HTML documents.}
10
+ gem.email = "rgabo@rgabostyle.com"
11
+ gem.homepage = "http://github.com/rgabo/readability"
12
+ gem.authors = ["Gabor Ratky"]
13
+ gem.add_development_dependency "rspec", ">= 1.3.0"
14
+ gem.add_runtime_dependency "harmony", "0.5.5"
15
+ gem.add_runtime_dependency "nokogiri", "~> 1.4"
16
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
17
+ end
18
+ Jeweler::GemcutterTasks.new
19
+ rescue LoadError
20
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
21
+ end
22
+
23
+ require 'spec/rake/spectask'
24
+ Spec::Rake::SpecTask.new(:spec) do |spec|
25
+ spec.libs << 'lib' << 'spec'
26
+ spec.spec_files = FileList['spec/**/*_spec.rb']
27
+ end
28
+
29
+ Spec::Rake::SpecTask.new(:rcov) do |spec|
30
+ spec.libs << 'lib' << 'spec'
31
+ spec.pattern = 'spec/**/*_spec.rb'
32
+ spec.rcov = true
33
+ end
34
+
35
+ task :spec => :check_dependencies
36
+
37
+ begin
38
+ require 'reek/adapters/rake_task'
39
+ Reek::RakeTask.new do |t|
40
+ t.fail_on_error = true
41
+ t.verbose = false
42
+ t.source_files = 'lib/**/*.rb'
43
+ end
44
+ rescue LoadError
45
+ task :reek do
46
+ abort "Reek is not available. In order to run reek, you must: gem install reek"
47
+ end
48
+ end
49
+
50
+ begin
51
+ require 'roodi'
52
+ require 'roodi_task'
53
+ RoodiTask.new do |t|
54
+ t.verbose = false
55
+ end
56
+ rescue LoadError
57
+ task :roodi do
58
+ abort "Roodi is not available. In order to run roodi, you must: gem install roodi"
59
+ end
60
+ end
61
+
62
+ task :default => :spec
63
+
64
+ require 'rake/rdoctask'
65
+ Rake::RDocTask.new do |rdoc|
66
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
67
+
68
+ rdoc.rdoc_dir = 'rdoc'
69
+ rdoc.title = "readability #{version}"
70
+ rdoc.rdoc_files.include('README*')
71
+ rdoc.rdoc_files.include('lib/**/*.rb')
72
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/example.rb ADDED
@@ -0,0 +1,14 @@
1
+ require 'rubygems'
2
+ require 'readability'
3
+ require 'open-uri'
4
+
5
+ # load document with Nokogiri
6
+ doc = Nokogiri::HTML(open(ARGV.first))
7
+
8
+ # set Readability parameters
9
+ doc.read_style = Readability::Style::NEWSPAPER
10
+ doc.read_size = Readability::Size::MEDIUM
11
+ doc.read_margin = Readability::Margin::MEDIUM
12
+
13
+ # Print result after Readability has been run
14
+ puts doc.to_readable
@@ -0,0 +1,45 @@
1
+ # ensure that lib is in the load path
2
+ $:.unshift(File.dirname(__FILE__)) unless $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
+
4
+ require 'rubygems'
5
+ require 'nokogiri'
6
+ require 'harmony'
7
+
8
+ require 'readability/harmonizable'
9
+ require 'readability/readable'
10
+
11
+ # Run the Arc90 Lab Experiment Readability on a Nokogiri document.
12
+ # TODO: Add example
13
+ #
14
+ module Readability
15
+ module Style
16
+ NEWSPAPER = "style-newspaper"
17
+ NOVEL = "style-novel"
18
+ EBOOK = "style-ebook"
19
+ TERMINAL = "style-terminal"
20
+ APERTURA = "style-apertura"
21
+ ATHELAS = "style-athelas"
22
+ end
23
+
24
+ module Size
25
+ XSMALL = "size-x-small"
26
+ SMALL = "size-small"
27
+ MEDIUM = "size-medium"
28
+ LARGE = "size-large"
29
+ XLARGE = "size-x-large"
30
+ end
31
+
32
+ module Margin
33
+ XNARROW = "margin-x-narrow"
34
+ NARROW = "margin-narrow"
35
+ MEDIUM = "margin-medium"
36
+ WIDE = "margin-wide"
37
+ XWIDE = "margin-x-wide"
38
+ end
39
+ end
40
+
41
+ class Nokogiri::HTML::Document
42
+ include Readability::Readable
43
+ end
44
+
45
+
@@ -0,0 +1,52 @@
1
+ module Readability
2
+ module Harmonizable
3
+ def window
4
+ if block_given?
5
+ harmony_page do |page|
6
+ yield page.window
7
+ page.window
8
+ end
9
+ else
10
+ harmony_page.window
11
+ end
12
+ end
13
+
14
+ def parse string_or_io, url = nil, encoding = nil, options = Nokogiri::XML::ParseOptions::DEFAULT_HTML, &block
15
+ self.root = Nokogiri::HTML::Document.parse(string_or_io, url, encoding, options, &block).root
16
+ end
17
+
18
+ def execute_js(code)
19
+ result = nil
20
+
21
+ harmony_page do |page|
22
+ result = page.execute_js(code)
23
+ end
24
+
25
+ result
26
+ end
27
+ alias :x :execute_js
28
+
29
+ def load_js(*paths)
30
+ harmony_page do |page|
31
+ page.load(*paths)
32
+ end
33
+
34
+ self
35
+ end
36
+
37
+ def harmony_page
38
+ # load document into a page
39
+ page = Harmony::Page.new(self.to_html)
40
+
41
+ # yield the page and reparse if a block is given
42
+ if block_given?
43
+ yield page
44
+
45
+ # parse the page back into the document
46
+ parse(page.to_html)
47
+ end
48
+
49
+ page
50
+ end
51
+ end
52
+ end
@@ -0,0 +1,1079 @@
1
+ var dbg = function(s) {
2
+ if(typeof console !== 'undefined') {
3
+ console.log("Readability: " + s);
4
+ }
5
+ };
6
+
7
+ /*
8
+ * Readability. An Arc90 Lab Experiment.
9
+ * Website: http://lab.arc90.com/experiments/readability
10
+ * Source: http://code.google.com/p/arc90labs-readability
11
+ *
12
+ * Copyright (c) 2009 Arc90 Inc
13
+ * Readability is licensed under the Apache License, Version 2.0.
14
+ **/
15
+ var readability = {
16
+ version: '1.5.0',
17
+ emailSrc: 'http://lab.arc90.com/experiments/readability/email.php',
18
+ iframeLoads: 0,
19
+ frameHack: false, /**
20
+ * The frame hack is to workaround a firefox bug where if you
21
+ * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
22
+ * So we fake a scrollbar in the wrapping div.
23
+ **/
24
+ bodyCache: null, /* Cache the body HTML in case we need to re-use it later */
25
+ flags: 0x1 | 0x2, /* Start with both flags set. */
26
+
27
+ /* constants */
28
+ FLAG_STRIP_UNLIKELYS: 0x1,
29
+ FLAG_WEIGHT_CLASSES: 0x2,
30
+
31
+ /**
32
+ * All of the regular expressions in use within readability.
33
+ * Defined up here so we don't instantiate them repeatedly in loops.
34
+ **/
35
+ regexps: {
36
+ unlikelyCandidatesRe: /combx|comment|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor/i,
37
+ okMaybeItsACandidateRe: /and|article|body|column|main/i,
38
+ positiveRe: /article|body|content|entry|hentry|page|pagination|post|text/i,
39
+ negativeRe: /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget/i,
40
+ divToPElementsRe: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
41
+ replaceBrsRe: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
42
+ replaceFontsRe: /<(\/?)font[^>]*>/gi,
43
+ trimRe: /^\s+|\s+$/g,
44
+ normalizeRe: /\s{2,}/g,
45
+ killBreaksRe: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
46
+ videoRe: /http:\/\/(www\.)?(youtube|vimeo)\.com/i
47
+ },
48
+
49
+ /**
50
+ * Runs readability.
51
+ *
52
+ * Workflow:
53
+ * 1. Prep the document by removing script tags, css, etc.
54
+ * 2. Build readability's DOM tree.
55
+ * 3. Grab the article content from the current dom tree.
56
+ * 4. Replace the current DOM tree with the new one.
57
+ * 5. Read peacefully.
58
+ *
59
+ * @return void
60
+ **/
61
+ init: function() {
62
+ document.body.style.display = "none";
63
+ if(document.body && !readability.bodyCache) {
64
+ readability.bodyCache = document.body.innerHTML; }
65
+
66
+ readability.prepDocument();
67
+
68
+ /* Build readability's DOM tree */
69
+ var overlay = document.createElement("DIV");
70
+ var innerDiv = document.createElement("DIV");
71
+ var articleTools = readability.getArticleTools();
72
+ var articleTitle = readability.getArticleTitle();
73
+ var articleContent = readability.grabArticle();
74
+ var articleFooter = readability.getArticleFooter();
75
+
76
+ /**
77
+ * If we attempted to strip unlikely candidates on the first run through, and we ended up with no content,
78
+ * that may mean we stripped out the actual content so we couldn't parse it. So re-run init while preserving
79
+ * unlikely candidates to have a better shot at getting our content out properly.
80
+ **/
81
+ if(readability.getInnerText(articleContent, false).length < 250)
82
+ {
83
+ if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
84
+ readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
85
+ document.body.innerHTML = readability.bodyCache;
86
+ return readability.init();
87
+ }
88
+ else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
89
+ readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
90
+ document.body.innerHTML = readability.bodyCache;
91
+ return readability.init();
92
+ }
93
+ else {
94
+ articleContent.innerHTML = "<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p><p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>";
95
+ }
96
+ }
97
+
98
+ overlay.id = "readOverlay";
99
+ innerDiv.id = "readInner";
100
+
101
+ /* Apply user-selected styling */
102
+ document.body.className = readStyle;
103
+ if (readStyle == "style-athelas" || readStyle == "style-apertura"){
104
+ overlay.className = readStyle + " rdbTypekit";
105
+ }
106
+ else {
107
+ overlay.className = readStyle;
108
+ }
109
+ innerDiv.className = readMargin + " " + readSize;
110
+
111
+ /* Glue the structure of our document together. */
112
+ // articleContent.appendChild( articleFooter );
113
+ innerDiv.appendChild( articleTitle );
114
+ innerDiv.appendChild( articleContent );
115
+ innerDiv.appendChild( articleFooter );
116
+ overlay.appendChild( articleTools );
117
+ overlay.appendChild( innerDiv );
118
+
119
+ /* Clear the old HTML, insert the new content. */
120
+ document.body.innerHTML = "";
121
+ document.body.insertBefore(overlay, document.body.firstChild);
122
+
123
+ if(readability.frameHack)
124
+ {
125
+ var readOverlay = document.getElementById('readOverlay');
126
+ readOverlay.style.height = '100%';
127
+ readOverlay.style.overflow = 'auto';
128
+ }
129
+
130
+ /**
131
+ * If someone tries to use Readability on a site's root page, give them a warning about usage.
132
+ **/
133
+ if((window.location.protocol + "//" + window.location.host + "/") == window.location.href)
134
+ {
135
+ articleContent.style.display = "none";
136
+ var rootWarning = document.createElement('p');
137
+ rootWarning.id = "readability-warning";
138
+ rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +
139
+ "If you'd like to try rendering this page anyways, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";
140
+
141
+ innerDiv.insertBefore( rootWarning, articleContent );
142
+ }
143
+ document.body.style.display = "block";
144
+
145
+ window.scrollTo(0, 0);
146
+
147
+ /* If we're using the Typekit library, select the font */
148
+ if (readStyle == "style-athelas" || readStyle == "style-apertura") {
149
+ readability.useRdbTypekit();
150
+ }
151
+ },
152
+
153
+ /**
154
+ * Get the article tools Element that has buttons like reload, print, email.
155
+ *
156
+ * @return Element
157
+ **/
158
+ getArticleTools: function () {
159
+ var articleTools = document.createElement("DIV");
160
+
161
+ articleTools.id = "readTools";
162
+ articleTools.innerHTML =
163
+ "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
164
+ "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
165
+ "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";
166
+
167
+ return articleTools;
168
+ },
169
+
170
+ /**
171
+ * Get the article title as an H1.
172
+ *
173
+ * @return Element
174
+ **/
175
+ getArticleTitle: function () {
176
+ var curTitle = "",
177
+ origTitle = "";
178
+
179
+ try {
180
+ curTitle = origTitle = document.title;
181
+
182
+ if(typeof curTitle != "string") { /* If they had an element with id "title" in their HTML */
183
+ curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
184
+ }
185
+ }
186
+ catch(e) {}
187
+
188
+ if(curTitle.match(/ [\|\-] /))
189
+ {
190
+ curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
191
+
192
+ if(curTitle.split(' ').length < 3) {
193
+ curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
194
+ }
195
+ }
196
+ else if(curTitle.indexOf(': ') !== -1)
197
+ {
198
+ curTitle = origTitle.replace(/.*:(.*)/gi, '$1');
199
+
200
+ if(curTitle.split(' ').length < 3) {
201
+ curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
202
+ }
203
+ }
204
+ else if(curTitle.length > 150 || curTitle.length < 15)
205
+ {
206
+ var hOnes = document.getElementsByTagName('h1');
207
+ if(hOnes.length == 1)
208
+ {
209
+ curTitle = readability.getInnerText(hOnes[0]);
210
+ }
211
+ }
212
+
213
+ curTitle = curTitle.replace( readability.regexps.trimRe, "" );
214
+
215
+ if(curTitle.split(' ').length <= 4) {
216
+ curTitle = origTitle;
217
+ }
218
+
219
+ var articleTitle = document.createElement("H1");
220
+ articleTitle.innerHTML = curTitle;
221
+
222
+ return articleTitle;
223
+ },
224
+
225
+ /**
226
+ * Get the footer with the readability mark etc.
227
+ *
228
+ * @return Element
229
+ **/
230
+ getArticleFooter: function () {
231
+ var articleFooter = document.createElement("DIV");
232
+
233
+ /**
234
+ * For research purposes, generate an img src that contains the chosen readstyle etc,
235
+ * so we can generate aggregate stats and change styles based on them in the future
236
+ **/
237
+ // var statsQueryParams = "?readStyle=" + encodeURIComponent(readStyle) + "&readMargin=" + encodeURIComponent(readMargin) + "&readSize=" + encodeURIComponent(readSize);
238
+ /* TODO: attach this to an image */
239
+
240
+ var twitterLink = document.createElement('a');
241
+ twitterLink.setAttribute('href','http://lab.arc90.com/experiments/readability');
242
+ twitterLink.setAttribute('id','footer-twitterLink');
243
+ twitterLink.setAttribute('title','Follow Arc90 on Twitter');
244
+ twitterLink.innerHTML = "Follow us on Twitter &raquo;";
245
+
246
+ articleFooter.id = "readFooter";
247
+ articleFooter.innerHTML =
248
+ "<div id='rdb-footer-left'>" +
249
+ "<a href='http://lab.arc90.com/experiments/readability' id='readability-logo'>Readability &mdash; </a>" +
250
+ "<a href='http://www.arc90.com/' id='arc90-logo'>An Arc90 Laboratory Experiment</a>" +
251
+ "<span id='readability-url'> &mdash; http://lab.arc90.com/experiments/readability</span>" +
252
+ "<a href='http://www.twitter.com/arc90' class='footer-twitterLink'>Follow us on Twitter &raquo;</a>" +
253
+ "</div>" +
254
+ "<div id='rdb-footer-right'>" +
255
+ "<a href='http://www.twitter.com/arc90' class='footer-twitterLink'>Follow us on Twitter &raquo;</a>" +
256
+ "<span class='version'>Readability version " + readability.version + "</span>" +
257
+ "</div>";
258
+
259
+ // if (readStyle == ("style-athelas" || "style-apertura")) {
260
+ // console.log("Using Typekit Footer");
261
+ // getElementById("rdb-footer-logo").appendChild(twitterLink);
262
+ // }
263
+ // else {
264
+ // console.log("Using Normal Footer");
265
+ // articleFooter.getElementById("rdb-footer-right").appendChild(twitterLink);
266
+ // }
267
+
268
+ return articleFooter;
269
+ },
270
+
271
+ /**
272
+ * Prepare the HTML document for readability to scrape it.
273
+ * This includes things like stripping javascript, CSS, and handling terrible markup.
274
+ *
275
+ * @return void
276
+ **/
277
+ prepDocument: function () {
278
+ /**
279
+ * In some cases a body element can't be found (if the HTML is totally hosed for example)
280
+ * so we create a new body node and append it to the document.
281
+ */
282
+ if(document.body === null)
283
+ {
284
+ var body = document.createElement("body");
285
+ try {
286
+ document.body = body;
287
+ }
288
+ catch(e) {
289
+ document.documentElement.appendChild(body);
290
+ dbg(e);
291
+ }
292
+ }
293
+
294
+ var frames = document.getElementsByTagName('frame');
295
+ if(frames.length > 0)
296
+ {
297
+ var bestFrame = null;
298
+ var bestFrameSize = 0;
299
+ for(var frameIndex = 0; frameIndex < frames.length; frameIndex++)
300
+ {
301
+ var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight;
302
+ var canAccessFrame = false;
303
+ try {
304
+ frames[frameIndex].contentWindow.document.body;
305
+ canAccessFrame = true;
306
+ }
307
+ catch(eFrames) {
308
+ dbg(eFrames);
309
+ }
310
+
311
+ if(canAccessFrame && frameSize > bestFrameSize)
312
+ {
313
+ bestFrame = frames[frameIndex];
314
+ bestFrameSize = frameSize;
315
+ }
316
+ }
317
+
318
+ if(bestFrame)
319
+ {
320
+ var newBody = document.createElement('body');
321
+ newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML;
322
+ newBody.style.overflow = 'scroll';
323
+ document.body = newBody;
324
+
325
+ var frameset = document.getElementsByTagName('frameset')[0];
326
+ if(frameset) {
327
+ frameset.parentNode.removeChild(frameset); }
328
+
329
+ readability.frameHack = true;
330
+ }
331
+ }
332
+
333
+ /* remove all scripts that are not readability */
334
+ var scripts = document.getElementsByTagName('script');
335
+ for(var i = scripts.length-1; i >= 0; i--)
336
+ {
337
+ if(scripts[i].src == null || typeof(scripts[i].src) == "undefined" || (scripts[i].src.indexOf('readability') == -1 && scripts[i].src.indexOf('typekit') == -1))
338
+ {
339
+ scripts[i].parentNode.removeChild(scripts[i]);
340
+ }
341
+ }
342
+
343
+ /* remove all stylesheets */
344
+ for (var k=0;k < document.styleSheets.length; k++) {
345
+ if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") == -1) {
346
+ document.styleSheets[k].disabled = true;
347
+ }
348
+ }
349
+
350
+ /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */
351
+ var styleTags = document.getElementsByTagName("style");
352
+ for (var st=0;st < styleTags.length; st++) {
353
+ if (navigator.appName != "Microsoft Internet Explorer") {
354
+ styleTags[st].textContent = ""; }
355
+ }
356
+
357
+ /* Turn all double br's into p's */
358
+ /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
359
+ document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrsRe, '</p><p>').replace(readability.regexps.replaceFontsRe, '<$1span>');
360
+ },
361
+
362
+ useRdbTypekit: function () {
363
+ var rdbHead = document.getElementsByTagName('head')[0];
364
+ var rdbTKScript = document.createElement('script');
365
+ var rdbTKCode = null;
366
+
367
+ var rdbTKLink = document.createElement('a');
368
+ rdbTKLink.setAttribute('class','rdbTK-powered');
369
+ rdbTKLink.setAttribute('title','Fonts by Typekit');
370
+ rdbTKLink.innerHTML = "Fonts by <span class='rdbTK'>Typekit</span>";
371
+
372
+ if (readStyle == "style-athelas") {
373
+ rdbTKCode = "sxt6vzy";
374
+ dbg("Using Athelas Theme");
375
+
376
+ rdbTKLink.setAttribute('href','http://typekit.com/?utm_source=readability&utm_medium=affiliate&utm_campaign=athelas');
377
+ rdbTKLink.setAttribute('id','rdb-athelas');
378
+ document.getElementById("rdb-footer-right").appendChild(rdbTKLink);
379
+ }
380
+ if (readStyle == "style-apertura") {
381
+ rdbTKCode = "bae8ybu";
382
+ dbg("Using Inverse Theme");
383
+
384
+ rdbTKLink.setAttribute('href','http://typekit.com/?utm_source=readability&utm_medium=affiliate&utm_campaign=inverse');
385
+ rdbTKLink.setAttribute('id','rdb-inverse');
386
+ document.getElementById("rdb-footer-right").appendChild(rdbTKLink);
387
+ }
388
+
389
+ /**
390
+ * Setting new script tag attributes to pull Typekits libraries
391
+ **/
392
+ rdbTKScript.setAttribute('type','text/javascript');
393
+ rdbTKScript.setAttribute('src',"http://use.typekit.com/" + rdbTKCode + ".js");
394
+ rdbTKScript.setAttribute('charset','UTF-8');
395
+ rdbHead.appendChild(rdbTKScript);
396
+
397
+ /**
398
+ * In the future, maybe try using the following experimental Callback function?:
399
+ * http://gist.github.com/192350
400
+ * &
401
+ * http://getsatisfaction.com/typekit/topics/support_a_pre_and_post_load_callback_function
402
+ **/
403
+ var typekitLoader = function() {
404
+ dbg("Looking for Typekit.");
405
+ if(typeof Typekit != "undefined") {
406
+ try {
407
+ dbg("Caught typekit");
408
+ Typekit.load();
409
+ clearInterval(window.typekitInterval);
410
+ } catch(e) {
411
+ dbg("Typekit error: " + e);
412
+ }
413
+ }
414
+ };
415
+
416
+ window.typekitInterval = window.setInterval(typekitLoader, 100);
417
+ },
418
+
419
+ /**
420
+ * Prepare the article node for display. Clean out any inline styles,
421
+ * iframes, forms, strip extraneous <p> tags, etc.
422
+ *
423
+ * @param Element
424
+ * @return void
425
+ **/
426
+ prepArticle: function (articleContent) {
427
+ readability.cleanStyles(articleContent);
428
+ readability.killBreaks(articleContent);
429
+
430
+ /* Clean out junk from the article content */
431
+ readability.clean(articleContent, "form");
432
+ readability.clean(articleContent, "object");
433
+ readability.clean(articleContent, "h1");
434
+ /**
435
+ * If there is only one h2, they are probably using it
436
+ * as a header and not a subheader, so remove it since we already have a header.
437
+ ***/
438
+ if(articleContent.getElementsByTagName('h2').length == 1) {
439
+ readability.clean(articleContent, "h2"); }
440
+ readability.clean(articleContent, "iframe");
441
+
442
+ readability.cleanHeaders(articleContent);
443
+
444
+ /* Do these last as the previous stuff may have removed junk that will affect these */
445
+ readability.cleanConditionally(articleContent, "table");
446
+ readability.cleanConditionally(articleContent, "ul");
447
+ readability.cleanConditionally(articleContent, "div");
448
+
449
+ /* Remove extra paragraphs */
450
+ var articleParagraphs = articleContent.getElementsByTagName('p');
451
+ for(var i = articleParagraphs.length-1; i >= 0; i--)
452
+ {
453
+ var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
454
+ var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
455
+ var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
456
+
457
+ if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) == '')
458
+ {
459
+ articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
460
+ }
461
+ }
462
+
463
+ try {
464
+ articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');
465
+ }
466
+ catch (e) {
467
+ dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
468
+ }
469
+ },
470
+
471
+ /**
472
+ * Initialize a node with the readability object. Also checks the
473
+ * className/id for special names to add to its score.
474
+ *
475
+ * @param Element
476
+ * @return void
477
+ **/
478
+ initializeNode: function (node) {
479
+ node.readability = {"contentScore": 0};
480
+
481
+ switch(node.tagName) {
482
+ case 'DIV':
483
+ node.readability.contentScore += 5;
484
+ break;
485
+
486
+ case 'PRE':
487
+ case 'TD':
488
+ case 'BLOCKQUOTE':
489
+ node.readability.contentScore += 3;
490
+ break;
491
+
492
+ case 'ADDRESS':
493
+ case 'OL':
494
+ case 'UL':
495
+ case 'DL':
496
+ case 'DD':
497
+ case 'DT':
498
+ case 'LI':
499
+ case 'FORM':
500
+ node.readability.contentScore -= 3;
501
+ break;
502
+
503
+ case 'H1':
504
+ case 'H2':
505
+ case 'H3':
506
+ case 'H4':
507
+ case 'H5':
508
+ case 'H6':
509
+ case 'TH':
510
+ node.readability.contentScore -= 5;
511
+ break;
512
+ }
513
+
514
+ node.readability.contentScore += readability.getClassWeight(node);
515
+ },
516
+
517
+ /***
518
+ * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
519
+ * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
520
+ *
521
+ * @return Element
522
+ **/
523
+ grabArticle: function () {
524
+ var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS);
525
+
526
+ /**
527
+ * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
528
+ * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
529
+ *
530
+ * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
531
+ * TODO: Shouldn't this be a reverse traversal?
532
+ **/
533
+ var node = null;
534
+ var nodesToScore = [];
535
+ for(var nodeIndex = 0; (node = document.getElementsByTagName('*')[nodeIndex]); nodeIndex++)
536
+ {
537
+ /* Remove unlikely candidates */
538
+ if (stripUnlikelyCandidates) {
539
+ var unlikelyMatchString = node.className + node.id;
540
+ if (unlikelyMatchString.search(readability.regexps.unlikelyCandidatesRe) !== -1 &&
541
+ unlikelyMatchString.search(readability.regexps.okMaybeItsACandidateRe) == -1 &&
542
+ node.tagName !== "BODY")
543
+ {
544
+ dbg("Removing unlikely candidate - " + unlikelyMatchString);
545
+ node.parentNode.removeChild(node);
546
+ nodeIndex--;
547
+ continue;
548
+ }
549
+ }
550
+
551
+ if (node.tagName === "P" || node.tagName === "TD") {
552
+ nodesToScore[nodesToScore.length] = node;
553
+ }
554
+
555
+ /* Turn all divs that don't have children block level elements into p's */
556
+ if (node.tagName === "DIV") {
557
+ if (node.innerHTML.search(readability.regexps.divToPElementsRe) === -1) {
558
+ dbg("Altering div to p");
559
+ var newNode = document.createElement('p');
560
+ try {
561
+ newNode.innerHTML = node.innerHTML;
562
+ node.parentNode.replaceChild(newNode, node);
563
+ nodeIndex--;
564
+ }
565
+ catch(e) {
566
+ dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
567
+ }
568
+ }
569
+ else
570
+ {
571
+ /* EXPERIMENTAL */
572
+ for(var i = 0, il = node.childNodes.length; i < il; i++) {
573
+ var childNode = node.childNodes[i];
574
+ if(childNode.nodeType == 3) { // Node.TEXT_NODE
575
+ dbg("replacing text node with a p tag with the same content.");
576
+ var p = document.createElement('p');
577
+ p.innerHTML = childNode.nodeValue;
578
+ p.style.display = 'inline';
579
+ p.className = 'readability-styled';
580
+ childNode.parentNode.replaceChild(p, childNode);
581
+ }
582
+ }
583
+ }
584
+ }
585
+ }
586
+
587
+ /**
588
+ * Loop through all paragraphs, and assign a score to them based on how content-y they look.
589
+ * Then add their score to their parent node.
590
+ *
591
+ * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
592
+ **/
593
+ var candidates = [];
594
+ for (var pt=0; pt < nodesToScore.length; pt++) {
595
+ var parentNode = nodesToScore[pt].parentNode;
596
+ var grandParentNode = parentNode.parentNode;
597
+ var innerText = readability.getInnerText(nodesToScore[pt]);
598
+
599
+ /* If this paragraph is less than 25 characters, don't even count it. */
600
+ if(innerText.length < 25) {
601
+ continue; }
602
+
603
+ /* Initialize readability data for the parent. */
604
+ if(typeof parentNode.readability == 'undefined')
605
+ {
606
+ readability.initializeNode(parentNode);
607
+ candidates.push(parentNode);
608
+ }
609
+
610
+ /* Initialize readability data for the grandparent. */
611
+ if(typeof grandParentNode.readability == 'undefined')
612
+ {
613
+ readability.initializeNode(grandParentNode);
614
+ candidates.push(grandParentNode);
615
+ }
616
+
617
+ var contentScore = 0;
618
+
619
+ /* Add a point for the paragraph itself as a base. */
620
+ contentScore++;
621
+
622
+ /* Add points for any commas within this paragraph */
623
+ contentScore += innerText.split(',').length;
624
+
625
+ /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
626
+ contentScore += Math.min(Math.floor(innerText.length / 100), 3);
627
+
628
+ /* Add the score to the parent. The grandparent gets half. */
629
+ parentNode.readability.contentScore += contentScore;
630
+ grandParentNode.readability.contentScore += contentScore/2;
631
+ }
632
+
633
+ /**
634
+ * After we've calculated scores, loop through all of the possible candidate nodes we found
635
+ * and find the one with the highest score.
636
+ **/
637
+ var topCandidate = null;
638
+ for(var c=0, cl=candidates.length; c < cl; c++)
639
+ {
640
+ /**
641
+ * Scale the final candidates score based on link density. Good content should have a
642
+ * relatively small link density (5% or less) and be mostly unaffected by this operation.
643
+ **/
644
+ candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));
645
+
646
+ dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);
647
+
648
+ if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
649
+ topCandidate = candidates[c]; }
650
+ }
651
+
652
+ /**
653
+ * If we still have no top candidate, just use the body as a last resort.
654
+ * We also have to copy the body node so it is something we can modify.
655
+ **/
656
+ if (topCandidate === null || topCandidate.tagName == "BODY")
657
+ {
658
+ topCandidate = document.createElement("DIV");
659
+ topCandidate.innerHTML = document.body.innerHTML;
660
+ document.body.innerHTML = "";
661
+ document.body.appendChild(topCandidate);
662
+ readability.initializeNode(topCandidate);
663
+ }
664
+
665
+
666
+ /**
667
+ * Now that we have the top candidate, look through its siblings for content that might also be related.
668
+ * Things like preambles, content split by ads that we removed, etc.
669
+ **/
670
+ var articleContent = document.createElement("DIV");
671
+ articleContent.id = "readability-content";
672
+ var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
673
+ var siblingNodes = topCandidate.parentNode.childNodes;
674
+ for(var s=0, sl=siblingNodes.length; s < sl; s++)
675
+ {
676
+ var siblingNode = siblingNodes[s];
677
+ var append = false;
678
+
679
+ dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability != 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
680
+ dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
681
+
682
+ if(siblingNode === topCandidate)
683
+ {
684
+ append = true;
685
+ }
686
+
687
+ var contentBonus = 0;
688
+ /* Give a small bonus if the sibling node and the top candidate have the exact same classname */
689
+ if(siblingNode.className == topCandidate.className && topCandidate.className != "") {
690
+ contentBonus += 10;
691
+ }
692
+
693
+ if(typeof siblingNode.readability != 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
694
+ {
695
+ append = true;
696
+ }
697
+
698
+ if(siblingNode.nodeName == "P") {
699
+ var linkDensity = readability.getLinkDensity(siblingNode);
700
+ var nodeContent = readability.getInnerText(siblingNode);
701
+ var nodeLength = nodeContent.length;
702
+
703
+ if(nodeLength > 80 && linkDensity < 0.25)
704
+ {
705
+ append = true;
706
+ }
707
+ else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
708
+ {
709
+ append = true;
710
+ }
711
+ }
712
+
713
+ if(append)
714
+ {
715
+ dbg("Appending node: " + siblingNode);
716
+
717
+ var nodeToAppend = null;
718
+ if(siblingNode.nodeName != "DIV" && siblingNode.nodeName != "P") {
719
+ /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
720
+
721
+ dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
722
+ nodeToAppend = document.createElement('div');
723
+ try {
724
+ nodeToAppend.id = siblingNode.id;
725
+ nodeToAppend.innerHTML = siblingNode.innerHTML;
726
+ }
727
+ catch(e)
728
+ {
729
+ dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
730
+ nodeToAppend = siblingNode;
731
+ s--;
732
+ sl--;
733
+ }
734
+ } else {
735
+ nodeToAppend = siblingNode;
736
+ s--;
737
+ sl--;
738
+ }
739
+
740
+ /* To ensure a node does not interfere with readability styles, remove its classnames */
741
+ nodeToAppend.className = "";
742
+
743
+ /* Append sibling and subtract from our list because it removes the node when you append to another node */
744
+ articleContent.appendChild(nodeToAppend);
745
+ }
746
+ }
747
+
748
+ /**
749
+ * So we have all of the content that we need. Now we clean it up for presentation.
750
+ **/
751
+ readability.prepArticle(articleContent);
752
+
753
+ return articleContent;
754
+ },
755
+
756
+ /**
757
+ * Get the inner text of a node - cross browser compatibly.
758
+ * This also strips out any excess whitespace to be found.
759
+ *
760
+ * @param Element
761
+ * @return string
762
+ **/
763
+ getInnerText: function (e, normalizeSpaces) {
764
+ var textContent = "";
765
+
766
+ normalizeSpaces = (typeof normalizeSpaces == 'undefined') ? true : normalizeSpaces;
767
+
768
+ textContent = e.innerText.replace( readability.regexps.trimRe, "" );
769
+
770
+ if(normalizeSpaces) {
771
+ return textContent.replace( readability.regexps.normalizeRe, " "); }
772
+ else {
773
+ return textContent; }
774
+ },
775
+
776
+ /**
777
+ * Get the number of times a string s appears in the node e.
778
+ *
779
+ * @param Element
780
+ * @param string - what to split on. Default is ","
781
+ * @return number (integer)
782
+ **/
783
+ getCharCount: function (e,s) {
784
+ s = s || ",";
785
+ return readability.getInnerText(e).split(s).length-1;
786
+ },
787
+
788
+ /**
789
+ * Remove the style attribute on every e and under.
790
+ * TODO: Test if getElementsByTagName(*) is faster.
791
+ *
792
+ * @param Element
793
+ * @return void
794
+ **/
795
+ cleanStyles: function (e) {
796
+ e = e || document;
797
+ var cur = e.firstChild;
798
+
799
+ if(!e) {
800
+ return; }
801
+
802
+ // Remove any root styles, if we're able.
803
+ if(typeof e.removeAttribute == 'function' && e.className != 'readability-styled') {
804
+ e.removeAttribute('style'); }
805
+
806
+ // Go until there are no more child nodes
807
+ while ( cur !== null ) {
808
+ if ( cur.nodeType == 1 ) {
809
+ // Remove style attribute(s) :
810
+ if(cur.className != "readability-styled") {
811
+ cur.removeAttribute("style");
812
+ }
813
+ readability.cleanStyles( cur );
814
+ }
815
+ cur = cur.nextSibling;
816
+ }
817
+ },
818
+
819
+ /**
820
+ * Get the density of links as a percentage of the content
821
+ * This is the amount of text that is inside a link divided by the total text in the node.
822
+ *
823
+ * @param Element
824
+ * @return number (float)
825
+ **/
826
+ getLinkDensity: function (e) {
827
+ var links = e.getElementsByTagName("a");
828
+ var textLength = readability.getInnerText(e).length;
829
+ var linkLength = 0;
830
+ for(var i=0, il=links.length; i<il;i++)
831
+ {
832
+ linkLength += readability.getInnerText(links[i]).length;
833
+ }
834
+
835
+ return linkLength / textLength;
836
+ },
837
+
838
+ /**
839
+ * Get an elements class/id weight. Uses regular expressions to tell if this
840
+ * element looks good or bad.
841
+ *
842
+ * @param Element
843
+ * @return number (Integer)
844
+ **/
845
+ getClassWeight: function (e) {
846
+ if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
847
+ return 0;
848
+ }
849
+
850
+ var weight = 0;
851
+
852
+ /* Look for a special classname */
853
+ if (e.className != "")
854
+ {
855
+ if(e.className.search(readability.regexps.negativeRe) !== -1) {
856
+ weight -= 25; }
857
+
858
+ if(e.className.search(readability.regexps.positiveRe) !== -1) {
859
+ weight += 25; }
860
+ }
861
+
862
+ /* Look for a special ID */
863
+ if (typeof(e.id) == 'string' && e.id != "")
864
+ {
865
+ if(e.id.search(readability.regexps.negativeRe) !== -1) {
866
+ weight -= 25; }
867
+
868
+ if(e.id.search(readability.regexps.positiveRe) !== -1) {
869
+ weight += 25; }
870
+ }
871
+
872
+ return weight;
873
+ },
874
+
875
+ /**
876
+ * Remove extraneous break tags from a node.
877
+ *
878
+ * @param Element
879
+ * @return void
880
+ **/
881
+ killBreaks: function (e) {
882
+ try {
883
+ e.innerHTML = e.innerHTML.replace(readability.regexps.killBreaksRe,'<br />');
884
+ }
885
+ catch (eBreaks) {
886
+ dbg("KillBreaks failed - this is an IE bug. Ignoring.: " + eBreaks);
887
+ }
888
+ },
889
+
890
+ /**
891
+ * Clean a node of all elements of type "tag".
892
+ * (Unless it's a youtube/vimeo video. People love movies.)
893
+ *
894
+ * @param Element
895
+ * @param string tag to clean
896
+ * @return void
897
+ **/
898
+ clean: function (e, tag) {
899
+ var targetList = e.getElementsByTagName( tag );
900
+ var isEmbed = (tag == 'object' || tag == 'embed');
901
+
902
+ for (var y=targetList.length-1; y >= 0; y--) {
903
+ /* Allow youtube and vimeo videos through as people usually want to see those. */
904
+ if(isEmbed) {
905
+ var attributeValues = "";
906
+ for (var i=0, il=targetList[y].attributes.length; i < il; i++) {
907
+ attributeValues += targetList[y].attributes[i].value + '|';
908
+ }
909
+
910
+ /* First, check the elements attributes to see if any of them contain youtube or vimeo */
911
+ if (attributeValues.search(readability.regexps.videoRe) !== -1) {
912
+ continue;
913
+ }
914
+
915
+ /* Then check the elements inside this element for the same. */
916
+ if (targetList[y].innerHTML.search(readability.regexps.videoRe) !== -1) {
917
+ continue;
918
+ }
919
+
920
+ }
921
+
922
+ targetList[y].parentNode.removeChild(targetList[y]);
923
+ }
924
+ },
925
+
926
+ /**
927
+ * Clean an element of all tags of type "tag" if they look fishy.
928
+ * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
929
+ *
930
+ * @return void
931
+ **/
932
+ cleanConditionally: function (e, tag) {
933
+ var tagsList = e.getElementsByTagName(tag);
934
+ var curTagsLength = tagsList.length;
935
+
936
+ /**
937
+ * Gather counts for other typical elements embedded within.
938
+ * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
939
+ *
940
+ * TODO: Consider taking into account original contentScore here.
941
+ **/
942
+ for (var i=curTagsLength-1; i >= 0; i--) {
943
+ var weight = readability.getClassWeight(tagsList[i]);
944
+ var contentScore = (typeof tagsList[i].readability != 'undefined') ? tagsList[i].readability.contentScore : 0;
945
+
946
+ dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability != 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
947
+
948
+ if(weight+contentScore < 0)
949
+ {
950
+ tagsList[i].parentNode.removeChild(tagsList[i]);
951
+ }
952
+ else if ( readability.getCharCount(tagsList[i],',') < 10) {
953
+ /**
954
+ * If there are not very many commas, and the number of
955
+ * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
956
+ **/
957
+ var p = tagsList[i].getElementsByTagName("p").length;
958
+ var img = tagsList[i].getElementsByTagName("img").length;
959
+ var li = tagsList[i].getElementsByTagName("li").length-100;
960
+ var input = tagsList[i].getElementsByTagName("input").length;
961
+
962
+ var embedCount = 0;
963
+ var embeds = tagsList[i].getElementsByTagName("embed");
964
+ for(var ei=0,il=embeds.length; ei < il; ei++) {
965
+ if (embeds[ei].src.search(readability.regexps.videoRe) == -1) {
966
+ embedCount++;
967
+ }
968
+ }
969
+
970
+ var linkDensity = readability.getLinkDensity(tagsList[i]);
971
+ var contentLength = readability.getInnerText(tagsList[i]).length;
972
+ var toRemove = false;
973
+
974
+ if ( img > p ) {
975
+ toRemove = true;
976
+ } else if(li > p && tag != "ul" && tag != "ol") {
977
+ toRemove = true;
978
+ } else if( input > Math.floor(p/3) ) {
979
+ toRemove = true;
980
+ } else if(contentLength < 25 && (img === 0 || img > 2) ) {
981
+ toRemove = true;
982
+ } else if(weight < 25 && linkDensity > 0.2) {
983
+ toRemove = true;
984
+ } else if(weight >= 25 && linkDensity > 0.5) {
985
+ toRemove = true;
986
+ } else if((embedCount == 1 && contentLength < 75) || embedCount > 1) {
987
+ toRemove = true;
988
+ }
989
+
990
+ if(toRemove) {
991
+ tagsList[i].parentNode.removeChild(tagsList[i]);
992
+ }
993
+ }
994
+ }
995
+ },
996
+
997
+ /**
998
+ * Clean out spurious headers from an Element. Checks things like classnames and link density.
999
+ *
1000
+ * @param Element
1001
+ * @return void
1002
+ **/
1003
+ cleanHeaders: function (e) {
1004
+ for (var headerIndex = 1; headerIndex < 7; headerIndex++) {
1005
+ var headers = e.getElementsByTagName('h' + headerIndex);
1006
+ for (var i=headers.length-1; i >=0; i--) {
1007
+ if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) {
1008
+ headers[i].parentNode.removeChild(headers[i]);
1009
+ }
1010
+ }
1011
+ }
1012
+ },
1013
+
1014
+ /**
1015
+ * Show the email popup.
1016
+ *
1017
+ * @return void
1018
+ **/
1019
+ emailBox: function () {
1020
+ var emailContainerExists = document.getElementById('email-container');
1021
+ if(null !== emailContainerExists)
1022
+ {
1023
+ return;
1024
+ }
1025
+
1026
+ var emailContainer = document.createElement('div');
1027
+ emailContainer.setAttribute('id', 'email-container');
1028
+ emailContainer.innerHTML = '<iframe src="'+readability.emailSrc + '?pageUrl='+escape(window.location)+'&pageTitle='+escape(document.title)+'" scrolling="no" onload="readability.removeFrame()" style="width:500px; height: 490px; border: 0;"></iframe>';
1029
+
1030
+ document.body.appendChild(emailContainer);
1031
+ },
1032
+
1033
+ /**
1034
+ * Close the email popup. This is a hacktackular way to check if we're in a "close loop".
1035
+ * Since we don't have crossdomain access to the frame, we can only know when it has
1036
+ * loaded again. If it's loaded over 3 times, we know to close the frame.
1037
+ *
1038
+ * @return void
1039
+ **/
1040
+ removeFrame: function () {
1041
+ readability.iframeLoads++;
1042
+ if (readability.iframeLoads > 3)
1043
+ {
1044
+ var emailContainer = document.getElementById('email-container');
1045
+ if (null !== emailContainer) {
1046
+ emailContainer.parentNode.removeChild(emailContainer);
1047
+ }
1048
+
1049
+ readability.iframeLoads = 0;
1050
+ }
1051
+ },
1052
+
1053
+ htmlspecialchars: function (s) {
1054
+ if (typeof(s) == "string") {
1055
+ s = s.replace(/&/g, "&amp;");
1056
+ s = s.replace(/"/g, "&quot;");
1057
+ s = s.replace(/'/g, "&#039;");
1058
+ s = s.replace(/</g, "&lt;");
1059
+ s = s.replace(/>/g, "&gt;");
1060
+ }
1061
+
1062
+ return s;
1063
+ },
1064
+
1065
+ flagIsActive: function(flag) {
1066
+ return (readability.flags & flag) > 0;
1067
+ },
1068
+
1069
+ addFlag: function(flag) {
1070
+ readability.flags = readability.flags | flag;
1071
+ },
1072
+
1073
+ removeFlag: function(flag) {
1074
+ readability.flags = readability.flags & ~flag;
1075
+ }
1076
+
1077
+ };
1078
+
1079
+ readability.init();