pismo 0.7.2 → 0.7.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/LICENSE +1 -1
- data/NOTICE +1 -1
- data/README.markdown +20 -7
- data/Rakefile +0 -23
- data/lib/pismo.rb +6 -3
- data/lib/pismo/document.rb +8 -3
- data/lib/pismo/internal_attributes.rb +38 -6
- data/lib/pismo/reader.rb +10 -394
- data/lib/pismo/reader/base.rb +261 -0
- data/lib/pismo/reader/cluster.rb +171 -0
- data/lib/pismo/reader/tree.rb +154 -0
- data/lib/pismo/version.rb +1 -1
- data/pismo.gemspec +2 -3
- data/test/corpus/metadata_expected.yaml +8 -2
- data/test/corpus/readers/cluster_expected.yaml +45 -0
- data/test/corpus/readers/tree_expected.yaml +55 -0
- data/test/corpus/thegoodbookblog.html +612 -0
- data/test/helper.rb +3 -0
- data/test/test_corpus.rb +16 -3
- metadata +108 -111
- data/test/corpus/metadata_expected.yaml.old +0 -122
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 3d4877c2e5c6a89e889252fd0a8d6fc6925a11ab
|
4
|
+
data.tar.gz: d20e4116a8483be2aeb2afe401e7594239f4ad07
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7d9f92027e982295d94a49005024f1fb0ffbe87cc6885e761274ecce4b541a8cbc078749305fc810be536a9d3cb9b711b9983c1d15cb74c7aeaafffcc258d007
|
7
|
+
data.tar.gz: 278729a65c1ff72a2ca7c9a62436183af4933bc6371334722aa5b50027779bfa673cf0161d17164e6090e2a948d659929e95b2951df9575b999de1b9a76f8a2c
|
data/.gitignore
CHANGED
data/LICENSE
CHANGED
data/NOTICE
CHANGED
data/README.markdown
CHANGED
@@ -2,15 +2,15 @@
|
|
2
2
|
|
3
3
|
## DESCRIPTION:
|
4
4
|
|
5
|
-
Pismo extracts machine-usable metadata from unstructured (or poorly structured) English-language HTML documents.
|
6
|
-
Data that Pismo can extract include titles, feed URLs, ledes, body text, image URLs, date, and keywords.
|
7
|
-
Pismo is used heavily in production on http://coder.io/ to extract data from Web pages.
|
5
|
+
Pismo extracts machine-usable metadata from unstructured (or poorly structured) English-language HTML documents. Data that Pismo can extract include titles, feed URLs, ledes, body text, image URLs, date, and keywords.
|
8
6
|
|
9
|
-
All tests pass on Ruby 1.
|
7
|
+
All tests pass on Ruby 1.9.3 and 2.0.0. Currently fails on JRuby 1.7.2 due to dependencies.
|
10
8
|
|
11
9
|
## NEWS:
|
12
10
|
|
13
|
-
|
11
|
+
February 27, 2013: Version 0.7.4 has been released to ensure Ruby 2.0.0 compatibility, but significant pull requests have yet to be merged and handled.
|
12
|
+
|
13
|
+
December 19, 2010: Version 0.7.2 has been released - it includes a patch from Darcy Laycock to fix keyword extraction problems on some pages, has switched from Jeweler to Bundler for management of the gem, and adds support for JRuby 1.5.6 by skipping stemming on that platform.
|
14
14
|
|
15
15
|
## USAGE:
|
16
16
|
|
@@ -49,7 +49,9 @@ The current metadata methods are:
|
|
49
49
|
|
50
50
|
These methods are not fully documented here yet - you'll just need to try them out. The plural methods like #titles, #authors, and #feeds will return multiple matches in an array, if present. This is so you can use your own techniques to choose a "best" result in ambiguous cases.
|
51
51
|
|
52
|
-
The html_body and body methods will be of particular interest. They return the "body" of the page as determined by Pismo's "Reader"
|
52
|
+
The html_body and body methods will be of particular interest. They return the "body" of the page as determined by Pismo's "Reader". #body returns it as plain text, while #html_body maintains some basic HTML styling.
|
53
|
+
|
54
|
+
The default reader is the "tree" reader. This works in a similar fashion to Arc90's Readability or Safari Reader algorithm.
|
53
55
|
|
54
56
|
New! The keywords method accepts optional arguments. These are the current defaults:
|
55
57
|
|
@@ -100,6 +102,17 @@ You can access Pismo's stopword list directly:
|
|
100
102
|
|
101
103
|
Pismo.stopwords # => [.., .., ..]
|
102
104
|
|
105
|
+
### Alternate readers
|
106
|
+
|
107
|
+
Pismo supports different readers for extracting the #body and #html_body from the web page.
|
108
|
+
|
109
|
+
The "cluster" reader uses an algorithm that tries to cluster contiguous content blocks together to identify the main document body. This is based on the ExtractContent gem (http://rubyforge.org/projects/extractcontent/).
|
110
|
+
|
111
|
+
The reader can be specified as part of #Document.new:
|
112
|
+
|
113
|
+
doc = Document.new(url, :reader => :cluster)
|
114
|
+
|
115
|
+
|
103
116
|
## Note on Patches/Pull Requests
|
104
117
|
|
105
118
|
* Fork the project.
|
@@ -115,4 +128,4 @@ Copyright (c) 2009, 2010 Peter Cooper
|
|
115
128
|
|
116
129
|
In short, you can use Pismo for whatever you like, commercial or not, but please include a brief credit (as in the NOTICE file - as per the Apache 2.0 License) somewhere deep in your license file or similar, and, if you're nice and have the time, let me know if you're using it and/or share any significant changes or improvements you make.
|
117
130
|
|
118
|
-
http://github.com/peterc/pismo
|
131
|
+
http://github.com/peterc/pismo
|
data/Rakefile
CHANGED
@@ -8,31 +8,8 @@ Rake::TestTask.new(:test) do |test|
|
|
8
8
|
test.verbose = true
|
9
9
|
end
|
10
10
|
|
11
|
-
begin
|
12
|
-
require 'rcov/rcovtask'
|
13
|
-
Rcov::RcovTask.new do |test|
|
14
|
-
test.libs << 'test'
|
15
|
-
test.pattern = 'test/**/test_*.rb'
|
16
|
-
test.verbose = true
|
17
|
-
end
|
18
|
-
rescue LoadError
|
19
|
-
task :rcov do
|
20
|
-
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
11
|
task :default => :test
|
25
12
|
|
26
|
-
require 'rake/rdoctask'
|
27
|
-
Rake::RDocTask.new do |rdoc|
|
28
|
-
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
29
|
-
|
30
|
-
rdoc.rdoc_dir = 'rdoc'
|
31
|
-
rdoc.title = "pismo #{version}"
|
32
|
-
rdoc.rdoc_files.include('README*')
|
33
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
34
|
-
end
|
35
|
-
|
36
13
|
desc 'Automatically run something when code is changed'
|
37
14
|
task :on_update do
|
38
15
|
require 'find'
|
data/lib/pismo.rb
CHANGED
@@ -9,6 +9,9 @@ require 'tempfile'
|
|
9
9
|
$: << File.dirname(__FILE__)
|
10
10
|
require 'pismo/document'
|
11
11
|
require 'pismo/reader'
|
12
|
+
require 'pismo/reader/base'
|
13
|
+
require 'pismo/reader/tree'
|
14
|
+
require 'pismo/reader/cluster'
|
12
15
|
|
13
16
|
if RUBY_PLATFORM == "java"
|
14
17
|
class String; def stem; self; end; end
|
@@ -18,8 +21,8 @@ end
|
|
18
21
|
|
19
22
|
module Pismo
|
20
23
|
# Sugar methods to make creating document objects nicer
|
21
|
-
def self.document(handle,
|
22
|
-
Document.new(handle,
|
24
|
+
def self.document(handle, options = {})
|
25
|
+
Document.new(handle, options)
|
23
26
|
end
|
24
27
|
|
25
28
|
# Loads a URL, as with Pismo['http://www.rubyinside.com'], and caches the Pismo document
|
@@ -76,4 +79,4 @@ class Nokogiri::HTML::Document
|
|
76
79
|
end
|
77
80
|
all && !r.empty? ? r : nil
|
78
81
|
end
|
79
|
-
end
|
82
|
+
end
|
data/lib/pismo/document.rb
CHANGED
@@ -5,14 +5,16 @@ module Pismo
|
|
5
5
|
|
6
6
|
# Pismo::Document represents a single HTML document within Pismo
|
7
7
|
class Document
|
8
|
-
attr_reader :doc, :url
|
8
|
+
attr_reader :doc, :url, :options
|
9
9
|
|
10
10
|
ATTRIBUTE_METHODS = InternalAttributes.instance_methods + ExternalAttributes.instance_methods
|
11
11
|
|
12
12
|
include Pismo::InternalAttributes
|
13
13
|
include Pismo::ExternalAttributes
|
14
14
|
|
15
|
-
def initialize(handle,
|
15
|
+
def initialize(handle, options = {})
|
16
|
+
@options = options
|
17
|
+
url = @options.delete(:url)
|
16
18
|
load(handle, url)
|
17
19
|
end
|
18
20
|
|
@@ -55,7 +57,10 @@ module Pismo
|
|
55
57
|
html.gsub!('”', '"')
|
56
58
|
html.gsub!("…", '...')
|
57
59
|
html.gsub!(' ', ' ')
|
60
|
+
html.gsub!('&lt;', '<')
|
61
|
+
html.gsub!('&gt;', '>')
|
62
|
+
html.gsub!('&amp;', '&')
|
58
63
|
html
|
59
64
|
end
|
60
65
|
end
|
61
|
-
end
|
66
|
+
end
|
@@ -76,7 +76,7 @@ module Pismo
|
|
76
76
|
def datetime
|
77
77
|
# TODO: Clean all this mess up
|
78
78
|
|
79
|
-
mo = %r{(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)}i
|
79
|
+
mo = %r{(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)\.?}i
|
80
80
|
|
81
81
|
regexen = [
|
82
82
|
/#{mo}\b\s+\d+\D{1,10}\d{4}/i,
|
@@ -86,6 +86,7 @@ module Pismo
|
|
86
86
|
/\d+(th|st|rd).{1,10}#{mo}\b[^\d]{1,10}\d+/i,
|
87
87
|
/\d+\s+#{mo}\b[^\d]{1,10}\d+/i,
|
88
88
|
/on\s+#{mo}\s+\d+/i,
|
89
|
+
/#{mo}\s+\d+,? \d{4}+/i,
|
89
90
|
/#{mo}\s+\d+/i,
|
90
91
|
/\d{4}[\.\/\-]\d{2}[\.\/\-]\d{2}/,
|
91
92
|
/\d{2}[\.\/\-]\d{2}[\.\/\-]\d{4}/
|
@@ -102,7 +103,8 @@ module Pismo
|
|
102
103
|
# Clean up the string for use by Chronic
|
103
104
|
datetime.strip!
|
104
105
|
datetime.gsub!(/(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)[^\w]*/i, '')
|
105
|
-
datetime.gsub!(/(mon|tues|tue|weds|wed|thurs|thur|thu|fri|sat|sun)[^\w]*/i, '')
|
106
|
+
datetime.gsub!(/(mon|tues|tue|weds|wed|thurs|thur|thu|fri|sat|sun)\.?[^\w]*/i, '')
|
107
|
+
datetime.gsub!(/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\./i, '\1')
|
106
108
|
datetime.sub!(/on\s+/, '')
|
107
109
|
datetime.gsub!(/\,/, '')
|
108
110
|
datetime.sub!(/(\d+)(th|st|rd)/, '\1')
|
@@ -230,8 +232,38 @@ module Pismo
|
|
230
232
|
def images(limit = 3)
|
231
233
|
reader_doc && !reader_doc.images.empty? ? reader_doc.images(limit) : nil
|
232
234
|
end
|
233
|
-
|
234
|
-
# Returns the
|
235
|
+
|
236
|
+
# Returns the tags or categories of the page/content
|
237
|
+
def tags
|
238
|
+
css_selectors = [
|
239
|
+
'.watch-info-tag-list a', # YouTube
|
240
|
+
'.entry .tags a', # Livejournal
|
241
|
+
'a[rel~=tag]', # Wordpress and many others
|
242
|
+
'a.tag', # Tumblr
|
243
|
+
'.tags a',
|
244
|
+
'.labels a',
|
245
|
+
'.categories a',
|
246
|
+
'.topics a'
|
247
|
+
]
|
248
|
+
|
249
|
+
tags = []
|
250
|
+
|
251
|
+
# grab the first one we get results from
|
252
|
+
css_selectors.each do |css_selector|
|
253
|
+
tags += @doc.css(css_selector)
|
254
|
+
break if tags.any?
|
255
|
+
end
|
256
|
+
|
257
|
+
# convert from Nokogiri Element objects to strings
|
258
|
+
tags.map!(&:inner_text)
|
259
|
+
|
260
|
+
# remove "#" from hashtag-like tags
|
261
|
+
tags.map! { |t| t.gsub(/^#/, '') }
|
262
|
+
|
263
|
+
tags
|
264
|
+
end
|
265
|
+
|
266
|
+
# Returns the "keywords" in the document (not the meta 'keywords' - they're next to useless now)
|
235
267
|
def keywords(options = {})
|
236
268
|
options = { :stem_at => 20, :word_length_limit => 15, :limit => 20, :remove_stopwords => true, :minimum_score => 2 }.merge(options)
|
237
269
|
|
@@ -261,7 +293,7 @@ module Pismo
|
|
261
293
|
end
|
262
294
|
|
263
295
|
def reader_doc
|
264
|
-
@reader_doc ||= Reader::Document.
|
296
|
+
@reader_doc ||= Reader::Document.create(@doc.to_s, @options)
|
265
297
|
end
|
266
298
|
|
267
299
|
# Returns body text as determined by Reader algorithm
|
@@ -312,4 +344,4 @@ module Pismo
|
|
312
344
|
feed(true)
|
313
345
|
end
|
314
346
|
end
|
315
|
-
end
|
347
|
+
end
|
data/lib/pismo/reader.rb
CHANGED
@@ -1,403 +1,19 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
require 'sanitize'
|
3
|
-
begin; require 'ap'; rescue LoadError; end
|
4
|
-
|
5
1
|
module Pismo
|
6
2
|
module Reader
|
7
3
|
class Document
|
8
|
-
attr_reader :raw_content, :doc, :content_candidates
|
9
|
-
|
10
|
-
# Elements to keep for /input/ sanitization
|
11
|
-
OK_ELEMENTS = %w{a td br th tbody table tr div span img strong em b i body html head title p h1 h2 h3 h4 h5 h6 pre code tt ul li ol blockquote font big small section article abbr audio video cite dd dt figure caption sup form dl dt dd center}
|
12
|
-
|
13
|
-
# Build a tree of attributes that are allowed for each element.. doing it this messy way due to how Sanitize works, alas
|
14
|
-
OK_ATTRIBUTES = {}
|
15
|
-
OK_CLEAN_ATTRIBUTES = {}
|
16
|
-
OK_ELEMENTS.each { |el| OK_ATTRIBUTES[el] = %w{id class href name content type alt title src} }
|
17
|
-
OK_ELEMENTS.each { |el| OK_CLEAN_ATTRIBUTES[el] = %w{href title src alt} }
|
18
|
-
|
19
|
-
|
20
|
-
# Words that we'd like to see in class and ID names for "content"
|
21
|
-
GOOD_WORDS = %w{content post blogpost main story body entry text desc asset hentry single entrytext postcontent bodycontent}.uniq
|
22
|
-
|
23
|
-
# Words that indicate crap in general
|
24
|
-
BAD_WORDS = %w{reply metadata options commenting comments comment about footer header outer credit sidebar widget subscribe clearfix date social bookmarks links share video watch excerpt related supplement accessibility offscreen meta title signup blq secondary feedback featured clearfix small job jobs listing listings navigation nav byline addcomment postcomment trackback neighbor ads commentform fbfans login similar thumb link blogroll grid twitter wrapper container nav sitesub printfooter editsection visualclear catlinks hidden toc contentsub caption disqus rss shoutbox sponsor blogcomments}.uniq
|
25
|
-
|
26
|
-
# Words that kill a branch dead
|
27
|
-
FATAL_WORDS = %w{comments comment bookmarks social links ads related similar footer digg totop metadata sitesub nav sidebar commenting options addcomment leaderboard offscreen job prevlink prevnext navigation reply-link hide hidden sidebox archives vcard}
|
28
|
-
|
29
|
-
META_WORDS = %w{january february march april may june july august september october november december jan feb mar apr may jun jul aug sep oct nov dec st th rd nd comments written posted on at published 2000 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 updated last gmt est pst pdt edt cet cdt cst article feature featured filed under comment comments follow twitter facebook email e-mail register story continue continues reading read inside more page next related response responses respond contact street phone tel email e-mail fax info tags tagged tag thanks credit creative commons copy nbsp lt gt this friend printable version subscribe rss mail follow twitter article via leave}.uniq
|
30
|
-
|
31
|
-
WONT_CONTAIN_FULL_CONTENT = %w{h1 h2 h3 h4 h5 h6 h6 li ol ul br a img meta cite strong em i b input head small big code title sup sub dd dt}
|
32
|
-
COULD_CONTAIN_FULL_CONTENT = %w{body div p table tr td article pre blockquote tbody section}
|
33
|
-
|
34
|
-
## Output sanitization element sets
|
35
|
-
BLOCK_OUTPUT_ELEMENTS = %w{div p h2 h3 h4 h5 h6 li dl pre ul ol blockquote section article audio video cite dd dt figure caption br table tr td thead tbody tfoot}
|
36
|
-
INLINE_OUTPUT_ELEMENTS = %w{a img b strong em i br code sup font small big dd dt}
|
37
|
-
OUTPUT_ELEMENTS = BLOCK_OUTPUT_ELEMENTS + INLINE_OUTPUT_ELEMENTS
|
38
|
-
NON_HEADER_ELEMENTS = %w{p br}
|
39
|
-
|
40
|
-
# Create a document object based on the raw HTML content provided
|
41
|
-
def initialize(raw_content)
|
42
|
-
@raw_content = Pismo::Document.clean_html(raw_content)
|
43
|
-
build_doc
|
44
|
-
end
|
45
|
-
|
46
|
-
def build_doc
|
47
|
-
@content = {}
|
48
|
-
|
49
|
-
if RUBY_VERSION > "1.9"
|
50
|
-
@raw_content.encode!("UTF-8", :invalid => :replace, :replace => '?') if @raw_content.encoding != "UTF-8"
|
51
|
-
@raw_content.encode!("ASCII-8BIT", :invalid => :replace, :replace => '?') if !@raw_content.valid_encoding?
|
52
|
-
end
|
53
|
-
|
54
|
-
# Normalize whitespace (as much to make debugging sessions look nice as anything else)
|
55
|
-
@raw_content.gsub!(/\s{2,}/, ' ')
|
56
|
-
@raw_content.gsub!(/\r/, "\n")
|
57
|
-
@raw_content.gsub!(/\n{3,}/, "\n\n")
|
58
|
-
@raw_content.gsub!(/(\<br(\s\/)?\>){2,}/, "</p><p>")
|
59
|
-
|
60
|
-
# Remove scripts manually, Sanitize and/or Nokogiri seem to go a bit funny with them
|
61
|
-
@raw_content.gsub!(/\<script .*?\<\/script\>/im, '')
|
62
|
-
|
63
|
-
# Get rid of bullshit "smart" quotes and other Unicode nonsense
|
64
|
-
@raw_content.force_encoding("ASCII-8BIT") if RUBY_VERSION > "1.9"
|
65
|
-
@raw_content.gsub!("\xe2\x80\x89", " ")
|
66
|
-
@raw_content.gsub!("\xe2\x80\x99", "'")
|
67
|
-
@raw_content.gsub!("\xe2\x80\x98", "'")
|
68
|
-
@raw_content.gsub!("\xe2\x80\x9c", '"')
|
69
|
-
@raw_content.gsub!("\xe2\x80\x9d", '"')
|
70
|
-
@raw_content.gsub!("\xe2\x80\xf6", '.')
|
71
|
-
@raw_content.force_encoding("UTF-8") if RUBY_VERSION > "1.9"
|
72
|
-
|
73
|
-
|
74
|
-
# Sanitize the HTML
|
75
|
-
@raw_content = Sanitize.clean(@raw_content,
|
76
|
-
:elements => OK_ELEMENTS,
|
77
|
-
:attributes => OK_ATTRIBUTES,
|
78
|
-
:remove_contents => true,
|
79
|
-
:output_encoding => 'utf-8'
|
80
|
-
)
|
81
|
-
|
82
|
-
@doc = Nokogiri::HTML(@raw_content, nil, 'utf-8')
|
83
|
-
|
84
|
-
build_analysis_tree
|
85
|
-
end
|
86
|
-
|
87
|
-
|
88
|
-
# Analyze the structure of the HTML document and score branches for likelihood of containing useful content
|
89
|
-
def build_analysis_tree
|
90
|
-
@tree = {}
|
91
|
-
subels = {}
|
92
|
-
|
93
|
-
t1 = Time.now.to_i + (Time.now.usec.to_f / 1000000)
|
94
|
-
|
95
|
-
# Do a pre clean up of elements.
|
96
|
-
@doc.css("div, span, table, tr, td, pre").each do |el|
|
97
|
-
# Any block elements with no child block elements can become paragraphs
|
98
|
-
if (BLOCK_OUTPUT_ELEMENTS & el.inner_html.scan(/\<(\w+)/).flatten).empty?
|
99
|
-
el.name = "p"
|
100
|
-
elsif el.name != "span"
|
101
|
-
el.name = "div"
|
102
|
-
end
|
103
|
-
|
104
|
-
# Any SPANs that aren't within paragraphs can become paragraphs too
|
105
|
-
el.name = "p" if el.name == "span" && !el.path.scan(/[a-z]+/).include?('p')
|
106
|
-
|
107
|
-
el.remove if (FATAL_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)).size > 0
|
108
|
-
end
|
109
|
-
|
110
|
-
@doc.css(COULD_CONTAIN_FULL_CONTENT.join(", ")).each do |el|
|
111
|
-
# Assume that no content we'll want comes in a total package of fewer than 80 characters!
|
112
|
-
next unless el.text.to_s.strip.length >= 80
|
113
|
-
|
114
|
-
path_segments = el.path.scan(/[a-z]+/)[2..-1] || []
|
115
|
-
depth = path_segments.length
|
116
4
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
(
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
#puts "IDS"
|
127
|
-
#ap ids
|
128
|
-
#puts "LOCAL IDS"
|
129
|
-
#ap local_ids
|
130
|
-
|
131
|
-
branch = {}
|
132
|
-
branch[:ids] = ids
|
133
|
-
branch[:local_ids] = local_ids
|
134
|
-
branch[:score] = -(BAD_WORDS & ids).size
|
135
|
-
branch[:score] += ((GOOD_WORDS & ids).size * 2)
|
136
|
-
next if branch[:score] < -5
|
137
|
-
|
138
|
-
#puts "#{ids.join(",")} - #{branch[:score].to_s} - #{el.text.to_s.strip.length}"
|
139
|
-
|
140
|
-
# Elements that have an ID or class are more likely to be our winners
|
141
|
-
branch[:score] += 2 unless local_ids.empty?
|
142
|
-
|
143
|
-
branch[:name] = el.name
|
144
|
-
branch[:depth] = depth
|
145
|
-
branch[:path] = el.path
|
146
|
-
|
147
|
-
branch[:raw_word_count] = 0
|
148
|
-
branch[:word_count] = 0
|
149
|
-
branch[:child_count] = 0
|
150
|
-
branch[:bad_child_count] = 0
|
151
|
-
branch[:score_steps] = []
|
152
|
-
|
153
|
-
|
154
|
-
el.traverse do |subel|
|
155
|
-
div_at_end_of_branch = false if subel.name == "div"
|
156
|
-
path = subel.path
|
157
|
-
subels[path] ||= {}
|
158
|
-
subels[path][:path_segments] ||= (path.scan(/[a-z]+/)[2..-1] || [])
|
159
|
-
subels[path][:is_text] ||= subel.text?
|
160
|
-
|
161
|
-
if subels[path][:is_text]
|
162
|
-
subels[path][:text] ||= subel.text.downcase.scan(/[a-z]+/)
|
163
|
-
next if subels[path][:text].empty?
|
164
|
-
|
165
|
-
subels[path][:raw_word_count] ||= subels[path][:text].size
|
166
|
-
subels[path][:word_count] ||= (%{a h1 h2 h3 h4 h5 h6 h6}.include?(subel.parent.name) ? 0 : subels[path][:text].select { |word| word.length > 3 }.size)
|
167
|
-
subels[path][:meta_matches] ||= (subels[path][:text] & META_WORDS).size
|
168
|
-
|
169
|
-
branch[:raw_word_count] += subels[path][:raw_word_count]
|
170
|
-
branch[:word_count] += subels[path][:word_count] - subels[path][:meta_matches]
|
171
|
-
end
|
172
|
-
|
173
|
-
subels[path][:ids] ||= (subel['id'].to_s + ' ' + subel['class'].to_s).gsub(/[^a-z]/, ' ').downcase.strip.split(/\s+/)
|
174
|
-
subels[path][:bad_child_count_inc] = (BAD_WORDS & subels[path][:ids]).size - (GOOD_WORDS & subels[path][:ids]).size
|
175
|
-
subels[path][:child_count_inc] = subels[path][:ids].empty? ? 0 : 1
|
176
|
-
|
177
|
-
branch[:bad_child_count] += subels[path][:bad_child_count_inc]
|
178
|
-
branch[:child_count] += subels[path][:child_count_inc]
|
179
|
-
end
|
180
|
-
|
181
|
-
branch[:score] += 2 if branch[:name] == "div"
|
182
|
-
branch[:score] += 4 if el.text.scan(/\,\s/).size > 10
|
183
|
-
branch[:score_steps] << "lots of commas!" if el.text.scan(/\,\s/).size > 5
|
184
|
-
branch[:score] *= 3
|
185
|
-
|
186
|
-
|
187
|
-
branch[:score] *= 0.7 if el.children && el.children.size < 3
|
188
|
-
branch[:score] *= 1.25 if branch[:raw_word_count] > 10
|
189
|
-
next if branch[:raw_word_count] < 10
|
190
|
-
branch[:score] += [branch[:word_count], 1].max ** 0.5
|
191
|
-
|
192
|
-
|
193
|
-
word_child_count_ratio = branch[:word_count].to_f / [branch[:child_count], 1].max
|
194
|
-
branch[:word_child_count_ratio] = word_child_count_ratio
|
195
|
-
|
196
|
-
if branch[:raw_word_count] > 100
|
197
|
-
good_word_ratio = branch[:word_count].to_f / branch[:raw_word_count]
|
198
|
-
branch[:score] += good_word_ratio * 12
|
199
|
-
|
200
|
-
if word_child_count_ratio > 50
|
201
|
-
branch[:score] *= 1.5
|
202
|
-
elsif word_child_count_ratio > 30
|
203
|
-
branch[:score] *= 1.2
|
204
|
-
elsif word_child_count_ratio > 15
|
205
|
-
branch[:score] *= 1.1
|
206
|
-
elsif word_child_count_ratio < 4
|
207
|
-
branch[:score] *= 0.9
|
208
|
-
end
|
209
|
-
end
|
210
|
-
|
211
|
-
branch[:score_steps] << "s1: #{branch[:score]}"
|
212
|
-
|
213
|
-
bad_child_ratio = branch[:bad_child_count].to_f / [branch[:child_count], 1].max
|
214
|
-
branch[:bad_child_ratio] = bad_child_ratio
|
215
|
-
branch[:score] += 3 if bad_child_ratio < 0.0
|
216
|
-
branch[:score] -= 3 if bad_child_ratio > 0.15
|
217
|
-
branch[:score] -= 2 if bad_child_ratio > 0.25
|
218
|
-
branch[:score] -= 2 if bad_child_ratio > 0.4
|
219
|
-
branch[:score] -= 4 if bad_child_ratio > 0.5
|
220
|
-
branch[:score] -= 5 if bad_child_ratio > 0.7
|
221
|
-
branch[:score] -= 5 if branch[:bad_child_count] > 20
|
222
|
-
|
223
|
-
branch[:score] += depth
|
224
|
-
branch[:score] *= 0.8 if ids.length > 10
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
@tree[el.path] = branch
|
229
|
-
end
|
230
|
-
|
231
|
-
|
232
|
-
sorted_tree = @tree.sort_by { |k, v| v[:score] }
|
233
|
-
|
234
|
-
#ap @doc.at(sorted_tree.first[0]).text
|
235
|
-
|
236
|
-
# Sort the branches by their score in reverse order
|
237
|
-
@content_candidates = sorted_tree.reverse.first([5, sorted_tree.length].min)
|
238
|
-
|
239
|
-
#ap @content_candidates #.map { |i| [i[0], i[1][:name], i[1][:ids].join(','), i[1][:score] ]}
|
240
|
-
#t2 = Time.now.to_i + (Time.now.usec.to_f / 1000000)
|
241
|
-
#puts t2 - t1
|
242
|
-
#exit
|
243
|
-
|
244
|
-
end
|
245
|
-
|
246
|
-
|
247
|
-
# Return the content from best match number of index (default 0) and, optionally, clean it to plain-text
|
248
|
-
def content(clean = false, index = 0)
|
249
|
-
return @content[[clean, index]] if @content[[clean, index]]
|
250
|
-
return '' if !@content_candidates || @content_candidates.empty?
|
251
|
-
|
252
|
-
content_branch = @doc.at(@content_candidates[index].first)
|
253
|
-
orphans_to_remove = []
|
254
|
-
|
255
|
-
#ap content_branch.to_html
|
256
|
-
#exit
|
257
|
-
|
258
|
-
# Go through every piece of the content and rip out sections that contain too many tags compared to words
|
259
|
-
# This is usually indicative of "widgets" or link bar sections
|
260
|
-
content_branch.css('*').each_with_index do |el, i|
|
261
|
-
next unless el
|
262
|
-
|
263
|
-
if el.name == "h1"
|
264
|
-
el.remove
|
265
|
-
next
|
266
|
-
end
|
267
|
-
|
268
|
-
if el.name == "h2" && content_branch.inner_html.scan('<h2').size == 1
|
269
|
-
el.remove
|
270
|
-
end
|
271
|
-
|
272
|
-
# Remove elements that contain words but there are more tags than words overall
|
273
|
-
# First, count the words
|
274
|
-
#word_count = 0
|
275
|
-
#el.traverse do |subel|
|
276
|
-
# if subel.text? && subel.path !~ /\/a\// && subel.path !~ /\/(h1|h2|h3|h4|h5|h6)\//
|
277
|
-
# word_count += (subel.text.downcase.scan(/[a-z]{4,}/) - META_WORDS).size
|
278
|
-
# end
|
279
|
-
#end
|
280
|
-
#
|
281
|
-
## .. then count the tags
|
282
|
-
#
|
283
|
-
#inner_tags = el.inner_html.scan(/\<\w.*?\>/).size
|
284
|
-
#if word_count < inner_tags && inner_tags > 3 && word_count < 250
|
285
|
-
# puts "At #{el.name} #{el['id']} #{el['class']} containing '#{el.text[0..20]}' we have #{word_count} valid words to #{el.inner_html.scan(/\<\w.*?\>/).size} tags"
|
286
|
-
# #puts "Removing #{el.name} #{el['id']} #{el['class']} TOO MANY TAGS FOR WORDS"
|
287
|
-
# el.remove
|
288
|
-
# next
|
289
|
-
#end
|
290
|
-
|
291
|
-
# If there are at least 2 words and a third of them are "meta words," remove the element
|
292
|
-
#inner_words = el.text.to_s.downcase.scan(/[a-z]{3,}/)
|
293
|
-
#if BLOCK_OUTPUT_ELEMENTS.include?(el.name) && inner_words.size >= 2
|
294
|
-
# if ((inner_words & META_WORDS).size >= (inner_words.size / 3))
|
295
|
-
# el.remove
|
296
|
-
# end
|
297
|
-
#end
|
298
|
-
|
299
|
-
if el.text && el.text.strip.length < 3 && !%w{img}.include?(el.name) && el.inner_html !~ /\<img/
|
300
|
-
el.remove
|
301
|
-
next
|
302
|
-
end
|
303
|
-
|
304
|
-
if el.name == "p" && el.text !~ /(\.|\?|\!|\"|\')(\s|$)/ && el.inner_html !~ /\<img/
|
305
|
-
el.remove
|
306
|
-
next
|
307
|
-
end
|
308
|
-
|
309
|
-
# If the ID or class of the element contains a fatally bad word, get rid of it
|
310
|
-
if (BAD_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.scan(/[a-z]+/)).length > 0
|
311
|
-
#puts "Removing #{el.name} #{el['id']} #{el['class']} BAD"
|
312
|
-
el.remove
|
313
|
-
next
|
314
|
-
end
|
5
|
+
def self.create(raw_content, options = {})
|
6
|
+
type = options.delete(:reader)
|
7
|
+
case type
|
8
|
+
when :score
|
9
|
+
Pismo::Reader::Tree.new(raw_content, options)
|
10
|
+
when :cluster
|
11
|
+
Pismo::Reader::Cluster.new(raw_content, options)
|
12
|
+
else
|
13
|
+
Pismo::Reader::Tree.new(raw_content, options)
|
315
14
|
end
|
316
|
-
|
317
|
-
# If a title was found early in the result document but had text before it, remove that text - it's probably crap
|
318
|
-
orphans_to_remove.each { |el| el.remove }
|
319
|
-
|
320
|
-
# Clean up the HTML again - Nokogiri outputs it with full doctype and crap
|
321
|
-
clean_html = strip(Sanitize.clean(content_branch.to_html, :elements => (clean ? BLOCK_OUTPUT_ELEMENTS : OUTPUT_ELEMENTS), :attributes => (clean ? OK_CLEAN_ATTRIBUTES : OK_ATTRIBUTES)))
|
322
|
-
|
323
|
-
# If the content is desired as "clean" (i.e. plain-text), do some quick fix-ups
|
324
|
-
if clean
|
325
|
-
# Get rid of line break tags, make list items look nice, remove all other HTML tags, and clean up spaces and newlines
|
326
|
-
clean_html.gsub!(/<br.*?>/, "\n")
|
327
|
-
clean_html.gsub!(/<li>/, '* ')
|
328
|
-
clean_html.gsub!(/<\w+>/, '')
|
329
|
-
clean_html.gsub!(/<\/\w+>/, "\n")
|
330
|
-
clean_html.gsub!(/\ +/, ' ')
|
331
|
-
clean_html.gsub!(/^\s+\n/, "\n")
|
332
|
-
clean_html.gsub!(/\n{2,}/, "\n")
|
333
|
-
clean_html.strip!
|
334
|
-
end
|
335
|
-
|
336
|
-
# If tags butt up against each other across lines, remove the line break(s)
|
337
|
-
clean_html.gsub!(/\>\n+\</, '><')
|
338
|
-
|
339
|
-
# Get rid of images whose sources are relative (TODO: Make this optional)
|
340
|
-
clean_html.gsub!(/\<img .*?\>/i) do |img_tag|
|
341
|
-
img_tag =~ /\Whttp/ ? img_tag : ''
|
342
|
-
end
|
343
|
-
|
344
|
-
# Remove empty tags
|
345
|
-
clean_html.gsub!(/<(\w+)><\/\1>/, "")
|
346
|
-
|
347
|
-
# Just a messy, hacky way to make output look nicer with subsequent paragraphs..
|
348
|
-
clean_html.gsub!(/<\/(div|p|h1|h2|h3|h4|h5|h6)>/, '</\1>' + "\n\n")
|
349
|
-
|
350
|
-
@content[[clean, index]] = clean_html
|
351
15
|
end
|
352
|
-
|
353
|
-
def sentences(qty = 3)
|
354
|
-
clean_content = Sanitize.clean(content, :elements => NON_HEADER_ELEMENTS, :attributes => OK_CLEAN_ATTRIBUTES, :remove_contents => %w{h1 h2 h3 h4 h5 h6})
|
355
16
|
|
356
|
-
fodder = ''
|
357
|
-
doc = Nokogiri::HTML(clean_content, nil, 'utf-8')
|
358
|
-
|
359
|
-
doc.traverse do |el|
|
360
|
-
path_segments = el.path.scan(/[a-z]+/)[2..-1]
|
361
|
-
next unless path_segments && path_segments.length > 1
|
362
|
-
if el.text? && el.text.strip.length < 3
|
363
|
-
el.remove
|
364
|
-
next
|
365
|
-
end
|
366
|
-
if el.text? && NON_HEADER_ELEMENTS.include?(path_segments[-2])
|
367
|
-
text = el.text.strip
|
368
|
-
text += "." if text !~ /[\.\!\?\"\']$/
|
369
|
-
fodder += text + "\n"
|
370
|
-
end
|
371
|
-
end
|
372
|
-
|
373
|
-
fodder = content(true) if fodder.to_s.length < 50
|
374
|
-
fodder.gsub!(/\b\w\W\s/, '')
|
375
|
-
|
376
|
-
#sentences = fodder.scan(/([\&\w\s\-\'\,\+\.\/\\\:\#\(\)\=\"\?\!]+?[\.\?\!])(\s|\Z)/im).map { |s| s.first }
|
377
|
-
sentences = fodder.scan(/(.+?[\.\?\!])(\s|\Z)/im).map { |s| s.first.strip }
|
378
|
-
|
379
|
-
sentences.compact!
|
380
|
-
sentences.map! { |s| s.strip }
|
381
|
-
sentences.map! { |s| s.sub(/^[^\"\'a-z0-9\(\[]+/im, '') }
|
382
|
-
sentences.map! { |s| s.sub(/[^a-z0-9\'\"\)\]\.\!\:\?]+$/im, '') }
|
383
|
-
sentences.map! { |s| s.gsub(/\s+/m, ' ') }
|
384
|
-
sentences.first(qty)
|
385
|
-
end
|
386
|
-
|
387
|
-
def images(qty = 3)
|
388
|
-
doc = Nokogiri::HTML(content, nil, 'utf-8')
|
389
|
-
images = []
|
390
|
-
doc.css("img").each do |img|
|
391
|
-
images << img['src']
|
392
|
-
break if images.length == qty
|
393
|
-
end
|
394
|
-
images
|
395
|
-
end
|
396
|
-
|
397
|
-
# Remove leading and trailing spaces on lines throughout a string (a bit like String#strip, but for multi-lines)
|
398
|
-
def strip(s)
|
399
|
-
s.gsub(/^\s+/, '').gsub(/\s+$/, '')
|
400
|
-
end
|
401
17
|
end
|
402
18
|
end
|
403
|
-
end
|
19
|
+
end
|