distillery 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source "http://rubygems.org"
2
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,5 @@
1
+ guard 'rspec' do
2
+ watch(%r{^spec/.+_spec\.rb})
3
+ watch(%r{^lib/(.+)\.rb}) { |m| "spec/lib/#{m[1]}_spec.rb" }
4
+ watch('spec/spec_helper.rb') { "spec" }
5
+ end
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Jeff Pollard
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,41 @@
1
+ # Distillery
2
+
3
+ Distillery extracts the "content" portion out of an HTML document. It applies heuristics based on element type, location, class/id name and other attributes to try and find the content part of the HTML document and return it.
4
+
5
+ The logic for Distillery was heavily influenced by [Readability](https://www.readability.com/), who was nice enough to make [their logic](http://code.google.com/p/arc90labs-readability/source/browse/trunk/js/readability.js) open source. Distillery does *not* aim to be a direct port of that logic. See [iterationlabs/ruby-readability](https://github.com/iterationlabs/ruby-readability) for something closer to that.
6
+
7
+ Readability and Distillery share nearly the same logic for locating the content HTML element on the page. Readability, however, also aggressively cleans and transforms the content element HTML to be used for display in a reading environment. Distillery aims to clean slightly less aggressively, and allow the user of the gem to choose how (and if) they would like to clean content element HTML.
8
+
9
+ ## Installation
10
+
11
+ gem install distillery
12
+
13
+ ## Usage
14
+
15
+ Usage is quite simple:
16
+
17
+ Distillery.distill(html_doc_as_a_string)
18
+ > "distilled content"
19
+
20
+ If you would like a more object-oriented syntax, Distillery offers a `Distillery::Document` API. Like the `distill` method above, its constructor takes a string that is the content of the HTML page you would like to distill:
21
+
22
+ doc = Distillery::Document.new(string_of_html)
23
+
24
+ Then you simply call `#distill!` on the document object to distill it and return the distilled content.
25
+
26
+ doc.distill!
27
+ > "distilled content"
28
+
29
+ Both the `Distillery::Document#distill!` and `Distillery.distill` methods by default will clean the HTML of the content to remove elements from it which are unlikely to be the actual content. Usually, these are things like social media share buttons, widgets, advertisements, etc. If you would like to not clean the content, simply pass `:dirty => true` to either method:
30
+
31
+ doc.distill!(:dirty => true)
32
+ > "raw distilled content"
33
+
34
+ ## From the command line
35
+
36
+ Distillery also ships with an executable that allows you to distill documents at the command line:
37
+
38
+ Usage: distill [options] http://www.example.com/
39
+ -d, --dirty Do not clean content HTML
40
+ -v, --version Print the version
41
+ -h, --help Print this help message
data/Rakefile ADDED
@@ -0,0 +1,40 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
3
+
4
+ require 'rspec/core/rake_task'
5
+ RSpec::Core::RakeTask.new(:spec) do |t|
6
+ t.rspec_opts = %w[--profile]
7
+ t.pattern = 'spec/**/*_spec.rb'
8
+ end
9
+
10
+ require "distillery"
11
+
12
# Builds a Distillery::Document from an HTML fixture stored under spec/fixtures.
#
# @param [String] fixture File name of the fixture, relative to spec/fixtures.
# @return [Distillery::Document] A document wrapping the fixture's HTML.
def doc_for_fixture(fixture)
  path = File.join(File.dirname(__FILE__), 'spec', 'fixtures', fixture)
  # File.read closes the handle itself; File.open(path).read left it open until GC.
  Distillery::Document.new(File.read(path))
end
16
+
17
# Developer helpers for inspecting how Distillery treats a fixture from spec/fixtures.
# Both tasks write their output under /tmp and launch it with the `open` command.
namespace :fixture do
  desc 'Open the fixture with data-score elements added showing an elements score'
  task :score, :filename do |_task, args|
    doc = doc_for_fixture(args[:filename])

    # Document#scores is only filled in by #score!, which in turn expects the
    # document to have been prepped first (the method name carries a bang).
    doc.prep_for_distillation!
    doc.score!
    doc.scores.each do |xpath, score|
      doc.at(xpath)['data-score'] = score.to_s
    end

    path = "/tmp/scored.#{args[:filename]}"
    # Block form flushes and closes the file before the viewer reads it.
    File.open(path, 'w') { |outfile| outfile << doc.to_s }
    # Multi-argument form bypasses the shell, so odd file names are passed verbatim.
    sh 'open', path
  end

  desc 'Distill a fixture and open it'
  task :distill, :filename do |_task, args|
    path = "/tmp/distilled.#{args[:filename]}"
    File.open(path, 'w') { |outfile| outfile << doc_for_fixture(args[:filename]).distill! }
    sh 'open', path
  end
end
39
+
40
+ task :default => :spec
data/TODO ADDED
@@ -0,0 +1,5 @@
1
+ - Give users the possibility of preserving the HTML of the content element as it was seen.
2
+ - Instead of a string, return a Node from Nokogiri
3
+ - Remove HTML comments from output
4
+ - Convert newline breaks to paragraphs
5
+ - Convert text nodes to <p> as well
data/bin/distill ADDED
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $LOAD_PATH << File.dirname(__FILE__) + '/../lib/'
4
+
5
+ require 'open-uri'
6
+ require 'distillery'
7
+ require 'slop'
8
+
9
# Command line options. `--version` prints the gem version and exits immediately;
# `--dirty` disables cleaning of the distilled content.
opts = Slop.parse :help => true do

  on :d, :dirty, 'Do not clean content HTML', default: false
  on :v, :version, 'Print the version' do
    puts Distillery::VERSION
    exit
  end

  banner "Usage: distill [options] http://www.example.com/"
end

# The URL to distill is expected to be the final command line argument.
url = ARGV.last.to_s

# \A anchors at the start of the whole string (^ would also match after a newline),
# and the scheme is required so only http(s) URLs are ever fetched.
if url =~ %r{\Ahttps?://}i
  # URI#open (provided by open-uri) is used instead of Kernel#open: Kernel#open no
  # longer fetches URLs on Ruby 3.0+ and would treat a leading "|" as a command.
  puts Distillery.distill(URI.parse(url).open.read, :clean => !opts.dirty?)
else
  puts opts.help
end
@@ -0,0 +1,31 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "distillery/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "distillery"
7
+ s.version = Distillery::VERSION
8
+ s.platform = Gem::Platform::RUBY
9
+ s.authors = ["Jeff Pollard"]
10
+ s.email = ["jeff.pollard@gmail.com"]
11
+ s.homepage = "https://github.com/Fluxx/distillery"
12
+ s.summary = %q{Extract the content portion of an HTML document.}
13
+ s.description = %q{Distillery extracts the "content" portion out of an HTML document. It applies heuristics based on element type, location, class/id name and other attributes to try and find the content part of the HTML document and return it.}
14
+
15
+ s.rubyforge_project = "distillery"
16
+
17
+ s.files = `git ls-files`.split("\n")
18
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
19
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
20
+ s.require_paths = ["lib"]
21
+
22
+ s.add_dependency('nokogiri', '> 1.0')
23
+ s.add_dependency('slop', '> 1.0')
24
+
25
+ s.add_development_dependency('rspec', '> 2.0')
26
+ s.add_development_dependency('guard')
27
+ s.add_development_dependency('guard-rspec')
28
+ s.add_development_dependency('ruby-debug19')
29
+ s.add_development_dependency('rb-fsevent')
30
+ s.add_development_dependency('growl')
31
+ end
data/lib/distillery.rb ADDED
@@ -0,0 +1,15 @@
1
+ require "distillery/document"
2
+ require "distillery/version"
3
+
4
+ module Distillery
5
+ ROOT = File.dirname(__FILE__)
6
+
7
+ # Distills the HTML document string to just the content portion.
8
+ #
9
+ # @param [String] str The HTML document to distill as a string.
10
+ # @param [Hash] options Distillation options
11
+ # @option options [Boolean] :dirty Do not clean the content element HTML
12
+ def self.distill(str, options = {})
13
+ Document.new(str).distill!(options)
14
+ end
15
+ end
@@ -0,0 +1,181 @@
1
+ require "delegate"
2
+ require "nokogiri"
3
+
4
module Distillery

  # Wraps a Nokogiri document for the HTML page to be distilled and holds all methods to
  # clean and distill the document down to just its content element. Methods that are
  # not defined here (#search, #at, #to_s, ...) are delegated to the wrapped document.
  class Document < SimpleDelegator

    # HTML elements unlikely to contain the content element.
    UNLIKELY_TAGS = %w[head script link meta].freeze

    # HTML ids and classes that are unlikely to contain the content element.
    UNLIKELY_IDENTIFIERS = /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i

    # "Block" elements which signal that their parent is less likely to be the content
    # element.
    BLOCK_ELEMENTS = %w[a blockquote dl div img ol p pre table ul].freeze

    # HTML ids and classes that are positive signals of the content element.
    POSITIVE_IDENTIFIERS = /article|body|content|entry|hentry|page|pagination|post|text/i

    # HTML ids and classes that are negative signals of the content element.
    NEGATIVE_IDENTIFIERS = /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget/i

    # HTML elements that are unrelated to the content in the content element.
    UNRELATED_ELEMENTS = %w[iframe form object].freeze

    # HTML elements that are possibly unrelated to the content of the content HTML
    # element.
    POSSIBLE_UNRELATED_ELEMENTS = %w[table ul div].freeze

    # Hash of xpath => content score of elements in this document. Unknown xpaths
    # default to a score of 0.
    attr_reader :scores

    # Create a new Document
    #
    # @param [String] page_string The HTML document to distill as a string.
    def initialize(page_string)
      @scores = Hash.new(0)
      super(::Nokogiri::HTML(page_string))
    end

    # The wrapped Nokogiri document.
    #
    # This used to be an attr_reader for an instance variable that was never assigned,
    # so it always returned nil; the delegate target is the actual document.
    def doc
      __getobj__
    end

    # Removes irrelevant elements from the document. This is usually things like
    # <script>, <link> and other page elements we don't care about.
    #
    # @param [Array<String>] tags Selectors of the elements to remove.
    def remove_irrelevant_elements!(tags = UNLIKELY_TAGS)
      search(*tags).each(&:remove)
    end

    # Removes unlikely elements from the document. These are elements whose class or id
    # seems to indicate they are comments, headers, footers, nav, etc. The <body>
    # element is never removed.
    def remove_unlikely_elements!
      search('*').each do |element|
        next if element.name == 'body'

        identifiers = "#{element['class']}#{element['id']}"
        element.remove if identifiers =~ UNLIKELY_IDENTIFIERS
      end
    end

    # Corrects improper use of HTML tags by coercing <div> elements that are likely
    # paragraphs to <p> tags.
    def coerce_elements_to_paragraphs!
      search('div').each do |div|
        div.name = "p" if has_no_block_children?(div) || has_only_empty_div_children?(div)
      end
    end

    # Scores the document elements based on an algorithm to find elements which hold
    # page content. Each paragraph earns a base point, one point per comma-separated
    # segment of its text and up to three points for its length. The paragraph's parent
    # receives the same points and its grandparent half of them. Finally every score is
    # scaled down by the element's link density.
    def score!
      search('p').each do |paragraph|
        text   = paragraph.text
        points = 1 + text.split(',').length + [text.length / 100, 3].min
        parent = paragraph.parent

        scores[paragraph.path] = points
        scores[parent.path] += points
        scores[parent.parent.path] += points.to_f / 2
      end

      augment_scores_by_link_weight!
    end

    # Distills the document down to just its content.
    #
    # @param [Hash] options Distillation options. The hash is not modified.
    # @option options [Boolean] :dirty Do not clean the content element HTML
    # @option options [Boolean] :clean Pass false to skip cleaning (same effect as
    #   :dirty => true; kept for existing callers).
    # @return [String] The inner HTML of the top scoring element.
    def distill!(options = {})
      prep_for_distillation!
      score!
      clean_top_scoring_element! if clean?(options)

      top_scoring_element.inner_html
    end

    # Attempts to clean the top scoring node from non-page content items, such as
    # advertisements, widgets, etc
    def clean_top_scoring_element!
      content = top_scoring_element

      # First pass: drop elements without any visible text.
      content.search("*").each do |node|
        node.remove if has_empty_text?(node)
      end

      # Second pass: drop embedded/unrelated elements and comma-poor containers that
      # look like widgets rather than prose.
      content.search("*").each do |node|
        unrelated = UNRELATED_ELEMENTS.include?(node.name)
        node.remove if unrelated || (node.text.count(',') < 2 && unlikely_to_be_content?(node))
      end
    end

    # Prepares the document for distillation by removing irrelevant and unlikely
    # elements, as well as coercing some elements to paragraphs for scoring.
    def prep_for_distillation!
      remove_irrelevant_elements!
      remove_unlikely_elements!
      coerce_elements_to_paragraphs!
    end

    private

    # Whether the content element should be cleaned for the given options: cleaning is
    # skipped when :dirty is truthy or :clean is explicitly false.
    def clean?(options)
      return false if options[:dirty]

      options[:clean] != false
    end

    # Scales every recorded score by (1 - link density) of its element.
    def augment_scores_by_link_weight!
      scores.each do |xpath, points|
        scores[xpath] = points * (1 - link_density(at(xpath)))
      end
    end

    # Ratio of the text length inside <a> descendants to the element's total text
    # length.
    def link_density(elem)
      link_length  = elem.search('a').reduce(0) { |total, link| total + link.text.length }
      total_length = [elem.text.length, 1].max # Protect against dividing by 0
      link_length.to_f / total_length.to_f
    end

    # The element with the highest score, falling back to <body> when nothing has been
    # scored.
    def top_scoring_element
      top_xpath, _top_score = scores.sort_by { |_xpath, score| score }.last || ['/html/body', 1]
      at(top_xpath)
    end

    # True when none of the element's direct children is a block element.
    def has_no_block_children?(elem)
      elem.children.none? { |child| BLOCK_ELEMENTS.include?(child.name) }
    end

    # True when every <div> found beneath the element has empty text.
    def has_only_empty_div_children?(elem)
      elem.search('div').all? { |subdiv| subdiv.text == "" }
    end

    # Weight of the element's class/id: +25 when it matches a positive identifier and
    # -25 when it matches a negative one (both may apply).
    def identifier_weight(elem)
      identifiers = "#{elem['class']}+#{elem['id']}"

      weight = 0
      weight += 25 if identifiers =~ POSITIVE_IDENTIFIERS
      weight -= 25 if identifiers =~ NEGATIVE_IDENTIFIERS
      weight
    end

    # True when the element has no non-whitespace text and is not a <br>.
    def has_empty_text?(elem)
      elem.text.gsub(/\s/, '').empty? && elem.name != 'br'
    end

    # Heuristic deciding whether a table, list or div is probably not part of the
    # content. Elements of any other type are never considered unlikely.
    def unlikely_to_be_content?(elem)
      return false unless POSSIBLE_UNRELATED_ELEMENTS.include?(elem.name)

      paragraphs  = elem.search('p').length
      images      = elem.search('img').length
      list_items  = elem.search('li').length
      inputs      = elem.search('input').length
      weight      = identifier_weight(elem)
      density     = link_density(elem)
      text_length = elem.text.length

      weight < 0 ||                                             # Terrible weight
        text_length < 15 ||                                     # Empty text or too short text
        images > paragraphs ||                                  # More images than paragraphs
        (list_items > paragraphs && elem.name !~ /ul|ol/) ||    # Has lots of list items
        inputs > paragraphs / 3 ||                              # Has a high % of inputs
        (text_length < 25 && (images == 0 || images > 2)) ||    # Short text + no/high img count
        (weight < 25 && density > 0.2) ||                       # Weak content signal and moderate link density
        (weight >= 25 && density > 0.5)                         # Strong content signal and high link density
    end

  end
end
@@ -0,0 +1,3 @@
1
+ module Distillery
2
+ VERSION = "0.1.0"
3
+ end
@@ -0,0 +1,108 @@
1
+ require 'spec_helper'
2
+
3
+ def distillation_of(filename, &block)
4
+
5
+ describe "distillation of #{filename}" do
6
+
7
+ let(:fixture) do
8
+ File.read(File.join(File.dirname(__FILE__), 'fixtures', filename))
9
+ end
10
+
11
+ subject { Distillery::Document.new(fixture).distill! }
12
+
13
+ it 'should include the right elements' do
14
+ instance_eval(&block)
15
+ end
16
+ end
17
+ end
18
+
19
+ distillation_of 'agave_cookies.html' do
20
+ should =~ /AGAVE &amp; HONEY OATMEAL M&amp;M COOKIES/
21
+ should =~ /2 Tbsp lightly beaten egg/
22
+ should =~ /Recipe Source:/
23
+
24
+ should_not =~ /I am a HUGE fan of agave and cook/ # Post comment
25
+ should_not =~ /mnuEntertaining/ # ID of element in header
26
+ should_not =~ /Get Email Updates/ # Sidebar
27
+ should_not =~ /id="footer"/ # Footer
28
+ end
29
+
30
+ distillation_of 'clams_and_linguini.html' do
31
+ should =~ /<h2>Linguini with Clam Sauce Recipe<\/h2>/
32
+ should =~ /2 pounds small clams in the shell/
33
+ should =~ /completely evaporated./
34
+
35
+ should_not =~ /Licorice sounds interesting./ # Comment
36
+ should_not =~ /Bookmark this page using the following/ # Footer
37
+ should_not =~ /Google Search/ # Header
38
+ end
39
+
40
+ distillation_of 'beef_jerkey.html' do
41
+ should =~ /always had a weakness/
42
+ should =~ /2 pounds trimmed beef top round/
43
+ should =~ /Om nom nom nom/
44
+
45
+ should_not =~ /Leave a Reply/ # Footer
46
+ should_not =~ /EMAIL SUBSCRIPTION/ # Sidebar
47
+ should_not =~ /allthingssimpleblog.com\/feed\// # Header
48
+ end
49
+
50
+ distillation_of 'vanilla_pound_cake.html' do
51
+ should =~ /Tahitian bean for its floral notes/
52
+ should =~ /beat until light and fluffy/
53
+ should =~ /cake comes out clean/
54
+
55
+ should_not =~ /Pound cake is a classi/ # Comments
56
+ should_not =~ /Simple template. Powered by/ # Footer
57
+ should_not =~ /Conversions and Measurement Tips/ # Header
58
+ end
59
+
60
+ distillation_of 'clouds_shining_moment.html' do
61
+ should =~ /The Dueling Models of Cloud Computing/
62
+ should =~ /These kinds of failures don't expose the weaknesses/
63
+ should =~ /Dynamic DNS pointing to elastic load balancers/
64
+
65
+ should_not =~ /Razi Sharir/ # Comments
66
+ should_not =~ /All trademarks and registered/ # Footer
67
+ should_not =~ /Community Guidelines/ # Header
68
+ end
69
+
70
+ distillation_of 'game_blog.html' do
71
+ should =~ /Currently in my Plants vs Zombies clone/
72
+ should =~ /50% they start to show sign/
73
+ should =~ /can never get enough feedback./
74
+
75
+ should_not =~ /Tutorials/ # Header
76
+ should_not =~ /Java Project/ # Sidebar
77
+ should_not =~ /View all comments/ # Footer
78
+ end
79
+
80
+ distillation_of 'js_this_keyword.html' do
81
+ should =~ /keyword is ubiquitous yet misconceptions abound/
82
+ should =~ /in ECMAScript parlance these are/
83
+ should =~ /Annex C/
84
+
85
+ should_not =~ /11 RESPONSES TO UNDERSTANDING/ # Footer
86
+ should_not =~ /The JavaScript Comma Operator/ # Sidebar
87
+ should_not =~ /Auto-generating JavaScript Unit Test/ # Header
88
+ end
89
+
90
+ distillation_of 'nyt_social_media.html' do
91
+ should =~ /What happens if you bring together/
92
+ should =~ /shows a 2D bar-graph-like timeline/
93
+ should =~ /then to explore several links/
94
+
95
+ should_not =~ /ADD A COMMENT/ # Comments
96
+ should_not =~ /ABOUT 1,000 POSTS AGO/ # Sidebar
97
+ should_not =~ /iPhone Tracker: How your/ # Header
98
+ end
99
+
100
+ distillation_of 'ginger_cookies.html' do
101
+ should =~ /Ginger cookies are chilled/
102
+ should =~ /12 minutes/
103
+ should =~ /Makes about 4 dozen crispy/
104
+
105
+ should_not =~ /Sponsored Links/ # Sidebar
106
+ should_not =~ /User Reviews/ # Comments
107
+ should_not =~ /Free Southern Food Newsletter!/ # Header
108
+ end