ruby-readability-discourse 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b35bc58da2245b0dda58c335c4cfea4551128c01
4
+ data.tar.gz: ae3c1969f961e4a567dfe27a7fe76ae7d280b637
5
+ SHA512:
6
+ metadata.gz: d31aeecfd5bff5eaac11bc9d766997b8eb0a15d87d809ebe8619e73ddf7edbb2a59cb837fcc321c3db69271219a841a37e63949bc4a468d690c593fdea61dc78
7
+ data.tar.gz: 5938c50037f15006cc8be537f420e6d29f5ff0b6f3d0419a9debce85f6f7014566eba82b53653cdcf59e95f5ee955a7f4b73f880b6b0b723c3a4df503e89f27a
@@ -0,0 +1,10 @@
1
+ .DS_Store
2
+ .gem
3
+ .bundle
4
+ Gemfile.lock
5
+ pkg/*
6
+ .idea
7
+ .rvmrc
8
+ .ruby-gemset
9
+ .ruby-version
10
+ .yardoc/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --colour
2
+ --format s -c
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.1.1
4
+ - 2.0.0
5
+ - 1.9.3
6
+ script: "bundle exec rspec"
@@ -0,0 +1,3 @@
1
+ --readme README.markdown
2
+ -
3
+ LICENSE
data/Gemfile ADDED
@@ -0,0 +1,12 @@
1
# Development and test dependencies for the gem; runtime dependencies come
# from the gemspec (pulled in by the `gemspec` call at the bottom).

# Fetch gems over TLS so packages and metadata cannot be altered in transit.
source "https://rubygems.org"

gem 'fastimage', '~> 1.2.13'
gem 'rake'
gem 'guard'
gem 'guard-rspec'

group :test do
  gem "fakeweb", "~> 1.3.0"
end

gemspec
@@ -0,0 +1,9 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ guard :rspec do
5
+ watch(%r{^spec/.+_spec\.rb$})
6
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
7
+ watch('spec/spec_helper.rb') { "spec" }
8
+ end
9
+
data/LICENSE ADDED
@@ -0,0 +1,202 @@
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright [yyyy] [name of copyright owner]
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
@@ -0,0 +1,107 @@
1
+ Ruby Readability
2
+ ================
3
+
4
+ Ruby Readability is a tool for extracting the primary readable content of a
5
+ webpage. It is a Ruby port of arc90's readability project.
6
+
7
+ Build Status
8
+ ------------
9
+
10
+ [![Build Status](https://travis-ci.org/cantino/ruby-readability.png)](https://travis-ci.org/cantino/ruby-readability)
11
+
12
+ Install
13
+ -------
14
+
15
+ Command line:
16
+
17
+ (sudo) gem install ruby-readability
18
+
19
+ Bundler:
20
+
21
+ gem "ruby-readability", :require => 'readability'
22
+
23
+
24
+ Example
25
+ -------
26
+
27
+ require 'rubygems'
28
+ require 'readability'
29
+ require 'open-uri'
30
+
31
+ source = open('http://lab.arc90.com/experiments/readability/').read
32
+ puts Readability::Document.new(source).content
33
+
34
+
35
+ Options
36
+ -------
37
+
38
+ You may provide options to `Readability::Document.new`, including:
39
+
40
+ * `:tags`: the base whitelist of tags to sanitize, defaults to `%w[div p]`;
41
+ * `:remove_empty_nodes`: remove `<p>` tags that have no text content; also
42
+ removes `<p>` tags that contain only images;
43
+ * `:attributes`: whitelist of allowed attributes;
44
+ * `:debug`: provide debugging output, defaults false;
45
+ * `:encoding`: if the page is of a known encoding, you can specify it; if left
46
+ unspecified, the encoding will be guessed (only in Ruby 1.9.x and 2.x). If you wish
47
+ to disable guessing, supply `:do_not_guess_encoding => true`;
48
+ * `:html_headers`: in Ruby 1.9.x and 2.x these will be passed to the
49
+ `guess_html_encoding` gem to aid with guessing the HTML encoding;
50
+ * `:ignore_image_format`: image formats to skip in `#images`. For example:
51
+ `:ignore_image_format => ["gif", "png"]`;
52
+ * `:min_image_height`: set a minimum image height for `#images`;
53
+ * `:min_image_width`: set a minimum image width for `#images`.
54
+
55
+
56
+ Command Line Tool
57
+ -----------------
58
+
59
+ Readability comes with a command-line tool for experimentation in
60
+ `bin/readability`.
61
+
62
+ Usage: readability [options] URL
63
+ -d, --debug Show debug output
64
+ -i, --images Keep images and links
65
+ -h, --help Show this message
66
+
67
+
68
+ Images
69
+ ------
70
+
71
+ You can get a list of images in the content area with `Document#images`. This
72
+ feature requires that the `fastimage` gem be installed.
73
+
74
+ rbody = Readability::Document.new(body, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false)
75
+ rbody.images
76
+
77
+ Related Projects
78
+ ----------------
79
+
80
+ * [newspaper](https://github.com/codelucas/newspaper) is an advanced news extraction, article extraction, and content curation library for Python.
81
+
82
+ Potential Issues
83
+ ----------------
84
+
85
+ If you're on a Mac and are getting segmentation faults, see the discussion at
86
+ <https://github.com/sparklemotion/nokogiri/issues/404> and consider updating
87
+ your version of `libxml2`. Version 2.7.8 of `libxml2`, installed with `brew`,
88
+ worked for me:
89
+
90
+ gem install nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
91
+
92
+ Or if you're using bundler and Rails 3, you can run this command to make
93
+ bundler always globally build `nokogiri` this way:
94
+
95
+ bundle config build.nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
96
+
97
+
98
+ License
99
+ -------
100
+
101
+ This code is under the Apache License 2.0. See <http://www.apache.org/licenses/LICENSE-2.0>.
102
+
103
+ Ruby port by cantino, starrhorne, libc, and iterationlabs. Special thanks to fizx and marcosinger.
104
+
105
+
106
+ [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/cantino/ruby-readability/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
107
+
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,41 @@
1
#!/usr/bin/env ruby
# Command-line front end: reads the document named by the single positional
# argument and prints the content extracted by Readability::Document.
require 'rubygems'
require 'open-uri'
require 'optparse'
require File.dirname(__FILE__) + '/../lib/readability'

# Reads the document at +location+ and returns its body as a String.
#
# Kernel#open is deliberately avoided: it executes arguments that start with
# "|" as shell commands, and on Ruby 3.0+ it no longer fetches URLs at all.
# Locations that parse to a URI class offering a public #open (open-uri adds
# it to the HTTP(S)/FTP classes) are fetched remotely; anything else is
# treated as a local file path.
def read_source(location)
  uri = begin
    URI.parse(location)
  rescue URI::InvalidURIError
    nil
  end

  return uri.open.read if uri.respond_to?(:open)

  File.read(location)
end

options = { :debug => false, :images => false }
options_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename($0)} [options] URL"

  opts.on("-d", "--debug", "Show debug output") do |v|
    options[:debug] = v
  end

  opts.on("-i", "--images", "Keep images and links") do |i|
    options[:images] = i
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
options_parser.parse!

# Exactly one positional argument (the URL) is required.
if ARGV.length != 1
  STDERR.puts options_parser
  exit 1
end

text = read_source(ARGV.first)

# With --images, keep <img>/<a> tags and their src/href attributes, and do not
# drop paragraphs that contain only images.
params = if options[:images]
  { :tags => %w[div p img a],
    :attributes => %w[src href],
    :remove_empty_nodes => false,
    :debug => options[:debug] }
else
  { :debug => options[:debug] }
end

puts Readability::Document.new(text, params).content
@@ -0,0 +1,492 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'nokogiri'
5
+ require 'guess_html_encoding'
6
+
7
+ module Readability
8
+ class Document
9
+ DEFAULT_OPTIONS = {
10
+ :retry_length => 250,
11
+ :min_text_length => 25,
12
+ :remove_unlikely_candidates => true,
13
+ :weight_classes => true,
14
+ :clean_conditionally => true,
15
+ :remove_empty_nodes => true,
16
+ :min_image_width => 130,
17
+ :min_image_height => 80,
18
+ :ignore_image_format => [],
19
+ :blacklist => nil,
20
+ :whitelist => nil
21
+ }.freeze
22
+
23
+ attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
24
+
25
+ def initialize(input, options = {})
26
+ @options = DEFAULT_OPTIONS.merge(options)
27
+ @input = input
28
+
29
+ if RUBY_VERSION =~ /^(1\.9|2)/ && !@options[:encoding]
30
+ @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
31
+ @options[:encoding] = @input.encoding.to_s
32
+ end
33
+
34
+ @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
35
+ @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
36
+ @weight_classes = @options[:weight_classes]
37
+ @clean_conditionally = @options[:clean_conditionally]
38
+ @best_candidate_has_image = true
39
+ make_html
40
+ handle_exclusions!(@options[:whitelist], @options[:blacklist])
41
+ end
42
+
43
+ def prepare_candidates
44
+ @html.css("script, style").each { |i| i.remove }
45
+ remove_unlikely_candidates! if @remove_unlikely_candidates
46
+ transform_misused_divs_into_paragraphs!
47
+
48
+ @candidates = score_paragraphs(options[:min_text_length])
49
+ @best_candidate = select_best_candidate(@candidates)
50
+ end
51
+
52
+ def handle_exclusions!(whitelist, blacklist)
53
+ return unless whitelist || blacklist
54
+
55
+ if blacklist
56
+ elems = @html.css(blacklist)
57
+ if elems
58
+ elems.each do |e|
59
+ e.remove
60
+ end
61
+ end
62
+ end
63
+
64
+ if whitelist
65
+ elems = @html.css(whitelist).to_s
66
+
67
+ if body = @html.at_css('body')
68
+ body.css('*').each do |e|
69
+ e.remove
70
+ end
71
+ body.inner_html = elems
72
+ end
73
+ end
74
+
75
+ @input = @html.to_s
76
+ nil
77
+ end
78
+
79
+ def make_html(whitelist=nil, blacklist=nil)
80
+ @html = Nokogiri::HTML(@input, nil, @options[:encoding])
81
+ # In case document has no body, such as from empty string or redirect
82
+ @html = Nokogiri::HTML('<body />', nil, @options[:encoding]) if @html.css('body').length == 0
83
+
84
+ # Remove html comment tags
85
+ @html.xpath('//comment()').each { |i| i.remove }
86
+ end
87
+
88
+ def images(content=nil, reload=false)
89
+ begin
90
+ require 'fastimage'
91
+ rescue LoadError
92
+ raise "Please install fastimage in order to use the #images feature."
93
+ end
94
+
95
+ @best_candidate_has_image = false if reload
96
+
97
+ prepare_candidates
98
+ list_images = []
99
+ tested_images = []
100
+ content = @best_candidate[:elem] unless reload
101
+
102
+ return list_images if content.nil?
103
+ elements = content.css("img").map(&:attributes)
104
+
105
+ elements.each do |element|
106
+ next unless element["src"]
107
+
108
+ url = element["src"].value
109
+ height = element["height"].nil? ? 0 : element["height"].value.to_i
110
+ width = element["width"].nil? ? 0 : element["width"].value.to_i
111
+
112
+ if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?)
113
+ image = get_image_size(url)
114
+ next unless image
115
+ else
116
+ image = {:width => width, :height => height}
117
+ end
118
+
119
+ image[:format] = File.extname(url).gsub(".", "")
120
+
121
+ if tested_images.include?(url)
122
+ debug("Image was tested: #{url}")
123
+ next
124
+ end
125
+
126
+ tested_images.push(url)
127
+ if image_meets_criteria?(image)
128
+ list_images << url
129
+ else
130
+ debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
131
+ end
132
+ end
133
+
134
+ (list_images.empty? and content != @html) ? images(@html, true) : list_images
135
+ end
136
+
137
+ def get_image_size(url)
138
+ w, h = FastImage.size(url)
139
+ raise "Couldn't get size." if w.nil? || h.nil?
140
+ {:width => w, :height => h}
141
+ rescue => e
142
+ debug("Image error: #{e}")
143
+ nil
144
+ end
145
+
146
+ def image_meets_criteria?(image)
147
+ return false if options[:ignore_image_format].include?(image[:format].downcase)
148
+ image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0)
149
+ end
150
+
151
+ REGEXES = {
152
+ :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
153
+ :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
154
+ :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
155
+ :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
156
+ :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
157
+ :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
158
+ :replaceFontsRe => /<(\/?)font[^>]*>/i,
159
+ :trimRe => /^\s+|\s+$/,
160
+ :normalizeRe => /\s{2,}/,
161
+ :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
162
+ :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
163
+ }
164
+
165
+ def title
166
+ title = @html.css("title").first
167
+ title ? title.text : nil
168
+ end
169
+
170
# Searches the @html document for the author.
# Precedence information is meant to live on the wiki (TODO attach wiki URL
# if it is accepted).
#
# Sources are tried in this order and the first hit wins:
#   1. <meta name="dc.creator" content="Finch - http://www.getfinch.com" />
#   2. an element whose class contains "fn" inside one whose class contains
#      "vcard", e.g.
#      <span class="byline author vcard"><span>By</span><cite class="fn">Austin Fonacier</cite></span>
#   3. <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>
#      (TODO: strip out the (rel)?)
#   4. the element with id="author"
#
# @return [String, nil] the stripped author string, or nil if none is detected
def author
  @html.xpath('//meta[@name = "dc.creator"]').each do |meta|
    return meta['content'].strip if meta['content']
  end

  text_sources = [
    '//*[contains(@class, "vcard")]//*[contains(@class, "fn")]',
    '//a[@rel = "author"]',
    '//*[@id = "author"]'
  ]

  text_sources.each do |query|
    @html.xpath(query).each do |element|
      return element.text.strip if element.text
    end
  end

  nil
end
210
+
211
# Extracts and sanitises the main article markup.
#
# When the extracted article's text is shorter than options[:retry_length],
# one heuristic is switched off (unlikely-candidate removal, then class
# weighting, then conditional cleaning), the document is re-parsed and the
# extraction is attempted again.
#
# @param remove_unlikely_candidates [false, Object] pass false to disable
#   unlikely-candidate removal up front; any other value leaves it unchanged
# @return [String] the cleaned article markup
def content(remove_unlikely_candidates = :default)
  @remove_unlikely_candidates = false if remove_unlikely_candidates == false

  prepare_candidates
  article = get_article(@candidates, @best_candidate)
  cleaned_article = sanitize(article, @candidates, options)

  return cleaned_article unless article.text.strip.length < options[:retry_length]

  # Too little text: relax exactly one heuristic per retry.
  if @remove_unlikely_candidates
    @remove_unlikely_candidates = false
  elsif @weight_classes
    @weight_classes = false
  elsif @clean_conditionally
    @clean_conditionally = false
  else
    # nothing left to relax
    return cleaned_article
  end

  make_html
  content
end
236
+
237
# Assembles the article from the best candidate and its related siblings.
#
# Walks the children of the best candidate's parent and copies into a fresh
# <div>: the best candidate itself, any sibling scoring at least the
# threshold (the larger of 10 and 20% of the best score), and <p> siblings
# that look like prose.
#
# @param candidates [Hash] node => {:content_score, :elem}
# @param best_candidate [Hash] the winning entry ({:content_score, :elem})
# @return [Nokogiri::XML::Node] a new <div> holding copies of the chosen nodes
def get_article(candidates, best_candidate)
  # Now that we have the top candidate, look through its siblings for content that might also be related.
  # Things like preambles, content split by ads that we removed, etc.

  sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
  output = Nokogiri::XML::Node.new('div', @html)
  best_candidate[:elem].parent.children.each do |sibling|
    append = false
    append = true if sibling == best_candidate[:elem]
    append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold

    if sibling.name.downcase == "p"
      link_density = get_link_density(sibling)
      node_content = sibling.text
      node_length = node_content.length

      # Only ever *add* paragraphs here. The previous code assigned the result
      # of the if/elsif to `append`, which reset it to nil and dropped a <p>
      # already selected above (the best candidate or a high-scoring sibling).
      append ||= (node_length > 80 && link_density < 0.25) ||
                 (node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/)
    end

    if append
      sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects
      sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase)
      output << sibling_dup
    end
  end

  output
end
269
+
270
# Picks the highest-scoring candidate.
#
# Falls back to the document's <body> with a score of 0 when there are no
# candidates. The top five and the winner are reported through #debug.
#
# @param candidates [Hash] node => {:content_score, :elem}
# @return [Hash] the winning {:content_score, :elem} entry
def select_best_candidate(candidates)
  ranked = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

  debug("Top 5 candidates:")
  ranked.first(5).each do |entry|
    node = entry[:elem]
    debug("Candidate #{node.name}##{node[:id]}.#{node[:class]} with score #{entry[:content_score]}")
  end

  winner = ranked.first || { :elem => @html.css("body").first, :content_score => 0 }
  top = winner[:elem]
  debug("Best candidate #{top.name}##{top[:id]}.#{top[:class]} with score #{winner[:content_score]}")

  winner
end
283
+
284
# Fraction of an element's text that sits inside <a> descendants.
#
# @param elem [#text, #css] node to measure
# @return [Float] link text length divided by total text length; 0.0 for an
#   element without text (the division used to produce NaN there, which
#   could poison candidate scores and make sorting them fail)
def get_link_density(elem)
  text_length = elem.text.length
  return 0.0 if text_length.zero?

  link_length = elem.css("a").map(&:text).join("").length
  link_length / text_length.to_f
end
289
+
290
# Scores the parents and grandparents of every sufficiently long <p>/<td>.
#
# Each qualifying paragraph earns 1 point, plus one per comma-separated
# segment, plus up to 3 points for length (one per 100 characters). The
# parent receives the full amount and the grandparent half. Finally every
# candidate's score is scaled by (1 - link density).
#
# @param min_text_length [Integer] paragraphs with shorter text are ignored
# @return [Hash] node => {:content_score, :elem}
def score_paragraphs(min_text_length)
  candidates = {}

  @html.css("p,td").each do |elem|
    parent_node = elem.parent
    grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
    inner_text = elem.text

    # Paragraphs shorter than the minimum are not counted at all.
    next if inner_text.length < min_text_length

    candidates[parent_node] ||= score_node(parent_node)
    candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node

    points = 1 + inner_text.split(',').length + [(inner_text.length / 100).to_i, 3].min

    candidates[parent_node][:content_score] += points
    candidates[grand_parent_node][:content_score] += points / 2.0 if grand_parent_node
  end

  # Scale the final scores by link density. Good content should have a
  # relatively small link density (5% or less) and be mostly unaffected.
  candidates.each do |node, entry|
    entry[:content_score] *= (1 - get_link_density(node))
  end

  candidates
end
319
+
320
# Weight derived from an element's class and id attributes.
#
# Each non-empty attribute loses 25 when it matches REGEXES[:negativeRe] and
# gains 25 when it matches REGEXES[:positiveRe]; both can apply at once.
#
# @param e [#[]] element exposing :class and :id
# @return [Integer] 0 when class weighting is disabled
def class_weight(e)
  return 0 unless @weight_classes

  [:class, :id].inject(0) do |weight, attribute|
    label = e[attribute]
    next weight unless label && label != ""

    weight -= 25 if label =~ REGEXES[:negativeRe]
    weight += 25 if label =~ REGEXES[:positiveRe]
    weight
  end
end
336
+
337
# Base score adjustment per tag name; tags not listed contribute 0.
ELEMENT_SCORES = {
  'div'        => 5,
  'blockquote' => 3,
  'form'       => -3,
  'th'         => -5
}.freeze

# Builds the initial candidate entry for an element.
#
# @param elem [#name, #[]] element to score
# @return [Hash] {:content_score => class weight plus tag bonus, :elem => elem}
def score_node(elem)
  base = class_weight(elem)
  tag_bonus = ELEMENT_SCORES.fetch(elem.name.downcase, 0)

  { :content_score => base + tag_bonus, :elem => elem }
end
349
+
350
# Prints +str+ to standard output when the :debug option is set.
#
# @param str [String] message to print
# @return [nil]
def debug(str)
  return unless options[:debug]

  puts str
end
353
+
354
# Removes elements whose class+id string looks like page furniture.
#
# An element is removed when the concatenated class and id match
# REGEXES[:unlikelyCandidatesRe] without also matching
# REGEXES[:okMaybeItsACandidateRe]; <html> and <body> are never removed.
def remove_unlikely_candidates!
  @html.css("*").each do |elem|
    str = "#{elem[:class]}#{elem[:id]}"

    next unless str =~ REGEXES[:unlikelyCandidatesRe]
    next if str =~ REGEXES[:okMaybeItsACandidateRe]
    next if %w[html body].include?(elem.name.downcase)

    debug("Removing unlikely candidate - #{str}")
    elem.remove
  end
end
363
+
364
# Renames <div>s that contain no block-level markup to <p>.
#
# A <div> is renamed when its inner HTML does not match
# REGEXES[:divToPElementsRe]. Other elements are left untouched (wrapping
# bare text nodes in <p> tags was once sketched here but is not performed).
def transform_misused_divs_into_paragraphs!
  @html.css("*").each do |elem|
    next unless elem.name.downcase == "div"
    next if elem.inner_html =~ REGEXES[:divToPElementsRe]

    debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
    elem.name = "p"
  end
end
383
+
384
# Cleans the extracted article and serialises it.
#
# Removes low-value headers, forms/objects/iframes/embeds and (optionally)
# empty paragraphs, runs conditional cleaning, then reduces the markup to
# the tag whitelist: whitelisted elements keep only the attributes named in
# @options[:attributes]; every other element is replaced by its text.
#
# @param node [Nokogiri::XML::Node] article root (modified in place)
# @param candidates [Hash] scored candidates, passed to #clean_conditionally
# @param options [Hash] accepted but not read; @options is used instead
# @return [String] serialised markup with runs of line breaks collapsed
def sanitize(node, candidates, options = {})
  node.css("h1, h2, h3, h4, h5, h6").each do |header|
    header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
  end

  node.css("form, object, iframe, embed").each(&:remove)

  if @options[:remove_empty_nodes]
    # Paragraphs with no text content go; this also removes paragraphs that
    # contain only images.
    node.css("p").each do |paragraph|
      paragraph.remove if paragraph.content.strip.empty?
    end
  end

  # Conditionally clean <table>s, <ul>s, and <div>s
  clean_conditionally(node, candidates, "table, ul, div")

  # Hash lookups keep the per-element checks cheap.
  allowed_tags = {}
  (@options[:tags] || %w[div p]).each { |tag| allowed_tags[tag] = true }

  # These tags are replaced by their text padded with spaces, so that
  # a<br>b keeps a gap between the two words.
  padded_tags = {}
  %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center].each do |tag|
    padded_tags[tag] = true
  end

  ([node] + node.css("*")).each do |el|
    if allowed_tags[el.node_name]
      # Whitelisted element: strip every attribute that is not allowed.
      el.attributes.each do |attr_name, _attr|
        el.delete(attr_name) unless @options[:attributes] && @options[:attributes].include?(attr_name.to_s)
      end
    elsif el.parent.nil?
      # The root itself is not whitelisted: the result is its text only.
      node = Nokogiri::XML::Text.new(el.text, el.document)
      break
    else
      replacement = padded_tags[el.node_name] ? " #{el.text} " : el.text
      el.swap(Nokogiri::XML::Text.new(replacement, el.document))
    end
  end

  flags = Nokogiri::XML::Node::SaveOptions
  save_opts = flags::NO_DECLARATION | flags::NO_EMPTY_TAGS | flags::AS_XHTML
  html = node.serialize(:save_with => save_opts)

  # Collapse runs of line breaks into a single newline
  html.gsub(/[\r\n\f]+/, "\n")
end
444
+
445
# Fallback minimum text length for conditional cleaning, used when the
# options carry no :min_text_length. It was referenced below but never
# defined; 25 mirrors DEFAULT_OPTIONS[:min_text_length].
TEXT_LENGTH_THRESHOLD = 25

# Removes elements matching +selector+ that look like non-content.
#
# An element goes when its class weight plus content score is negative, or,
# for elements with fewer than 10 commas, when #clean_conditionally_reason?
# names a reason. Does nothing when conditional cleaning is disabled.
#
# @param node [#css] root to clean (modified in place)
# @param candidates [Hash] node => {:content_score, :elem}
# @param selector [String] CSS selector of elements to examine
def clean_conditionally(node, candidates, selector)
  return unless @clean_conditionally
  node.css(selector).each do |el|
    weight = class_weight(el)
    content_score = candidates[el] ? candidates[el][:content_score] : 0
    name = el.name.downcase

    if weight + content_score < 0
      el.remove
      debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
    elsif el.text.count(",") < 10
      counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
      # Bias the <li> count down so only very long lists trip the li-vs-p rule.
      counts["li"] -= 100

      content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
      link_density = get_link_density(el)

      # The tag name is passed explicitly; the helper used to read an
      # undefined `name` and raised NameError when it reached that branch.
      reason = clean_conditionally_reason?(counts, content_length, options, weight, link_density, name)
      if reason
        debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
        el.remove
      end
    end
  end
end

# Explains why an element should be conditionally removed, if it should.
#
# @param counts [Hash] descendant counts keyed by "p", "img", "li", "input", "embed"
# @param content_length [Integer] length of the element's stripped text
# @param options [Hash] reads :min_text_length
# @param weight [Integer] the element's class weight
# @param link_density [Float] the element's link density
# @param name [String, nil] lower-cased tag name; optional for backward
#   compatibility, a nil name is treated as neither "ul" nor "ol"
# @return [String, nil] a human-readable reason, or nil to keep the element
def clean_conditionally_reason?(counts, content_length, options, weight, link_density, name = nil)
  if counts["img"] > counts["p"]
    "too many images"
  elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
    "more <li>s than <p>s"
  elsif counts["input"] > (counts["p"] / 3).to_i
    "less than 3x <p>s than <input>s"
  elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
    "too short a content length without a single image"
  elsif weight < 25 && link_density > 0.2
    "too many links for its weight (#{weight})"
  elsif weight >= 25 && link_density > 0.5
    "too many links for its weight (#{weight})"
  elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
    "<embed>s with too short a content length, or too many <embed>s"
  else
    nil
  end
end
490
+
491
+ end
492
+ end