pismo 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
metadata CHANGED
@@ -1,7 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pismo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ hash: 7
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 6
9
+ - 0
10
+ version: 0.6.0
5
11
  platform: ruby
6
12
  authors:
7
13
  - Peter Cooper
@@ -9,69 +15,107 @@ autorequire:
9
15
  bindir: bin
10
16
  cert_chain: []
11
17
 
12
- date: 2010-06-01 00:00:00 +01:00
18
+ date: 2010-06-20 00:00:00 +01:00
13
19
  default_executable: pismo
14
20
  dependencies:
15
21
  - !ruby/object:Gem::Dependency
16
22
  name: shoulda
17
- type: :development
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
20
26
  requirements:
21
27
  - - ">="
22
28
  - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
23
32
  version: "0"
24
- version:
33
+ type: :development
34
+ version_requirements: *id001
25
35
  - !ruby/object:Gem::Dependency
26
- name: nokogiri
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
36
+ name: awesome_print
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
30
40
  requirements:
31
41
  - - ">="
32
42
  - !ruby/object:Gem::Version
43
+ hash: 3
44
+ segments:
45
+ - 0
33
46
  version: "0"
34
- version:
47
+ type: :development
48
+ version_requirements: *id002
35
49
  - !ruby/object:Gem::Dependency
36
- name: loofah
37
- type: :runtime
38
- version_requirement:
39
- version_requirements: !ruby/object:Gem::Requirement
50
+ name: jeweler
51
+ prerelease: false
52
+ requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
40
54
  requirements:
41
55
  - - ">="
42
56
  - !ruby/object:Gem::Version
57
+ hash: 3
58
+ segments:
59
+ - 0
43
60
  version: "0"
44
- version:
61
+ type: :runtime
62
+ version_requirements: *id003
45
63
  - !ruby/object:Gem::Dependency
46
- name: httparty
64
+ name: nokogiri
65
+ prerelease: false
66
+ requirement: &id004 !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ hash: 3
72
+ segments:
73
+ - 0
74
+ version: "0"
47
75
  type: :runtime
48
- version_requirement:
49
- version_requirements: !ruby/object:Gem::Requirement
76
+ version_requirements: *id004
77
+ - !ruby/object:Gem::Dependency
78
+ name: sanitize
79
+ prerelease: false
80
+ requirement: &id005 !ruby/object:Gem::Requirement
81
+ none: false
50
82
  requirements:
51
83
  - - ">="
52
84
  - !ruby/object:Gem::Version
85
+ hash: 3
86
+ segments:
87
+ - 0
53
88
  version: "0"
54
- version:
89
+ type: :runtime
90
+ version_requirements: *id005
55
91
  - !ruby/object:Gem::Dependency
56
92
  name: fast-stemmer
57
- type: :runtime
58
- version_requirement:
59
- version_requirements: !ruby/object:Gem::Requirement
93
+ prerelease: false
94
+ requirement: &id006 !ruby/object:Gem::Requirement
95
+ none: false
60
96
  requirements:
61
97
  - - ">="
62
98
  - !ruby/object:Gem::Version
99
+ hash: 3
100
+ segments:
101
+ - 0
63
102
  version: "0"
64
- version:
103
+ type: :runtime
104
+ version_requirements: *id006
65
105
  - !ruby/object:Gem::Dependency
66
106
  name: chronic
67
- type: :runtime
68
- version_requirement:
69
- version_requirements: !ruby/object:Gem::Requirement
107
+ prerelease: false
108
+ requirement: &id007 !ruby/object:Gem::Requirement
109
+ none: false
70
110
  requirements:
71
111
  - - ">="
72
112
  - !ruby/object:Gem::Version
113
+ hash: 3
114
+ segments:
115
+ - 0
73
116
  version: "0"
74
- version:
117
+ type: :runtime
118
+ version_requirements: *id007
75
119
  description: Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.
76
120
  email: git@peterc.org
77
121
  executables:
@@ -85,6 +129,7 @@ files:
85
129
  - .document
86
130
  - .gitignore
87
131
  - LICENSE
132
+ - NOTICE
88
133
  - README.markdown
89
134
  - Rakefile
90
135
  - VERSION
@@ -93,25 +138,30 @@ files:
93
138
  - lib/pismo/document.rb
94
139
  - lib/pismo/external_attributes.rb
95
140
  - lib/pismo/internal_attributes.rb
96
- - lib/pismo/readability.rb
141
+ - lib/pismo/reader.rb
97
142
  - lib/pismo/stopwords.txt
98
143
  - pismo.gemspec
99
144
  - test/corpus/bbcnews.html
145
+ - test/corpus/bbcnews2.html
100
146
  - test/corpus/briancray.html
101
147
  - test/corpus/cant_read.html
102
148
  - test/corpus/factor.html
149
+ - test/corpus/gmane.html
103
150
  - test/corpus/huffington.html
104
151
  - test/corpus/metadata_expected.yaml
105
152
  - test/corpus/metadata_expected.yaml.old
153
+ - test/corpus/queness.html
154
+ - test/corpus/reader_expected.yaml
106
155
  - test/corpus/rubyinside.html
107
156
  - test/corpus/rww.html
108
157
  - test/corpus/spolsky.html
109
158
  - test/corpus/techcrunch.html
159
+ - test/corpus/tweet.html
110
160
  - test/corpus/youtube.html
161
+ - test/corpus/zefrank.html
111
162
  - test/helper.rb
112
163
  - test/test_corpus.rb
113
164
  - test/test_pismo_document.rb
114
- - test/test_readability.rb
115
165
  has_rdoc: true
116
166
  homepage: http://github.com/peterc/pismo
117
167
  licenses: []
@@ -122,21 +172,27 @@ rdoc_options:
122
172
  require_paths:
123
173
  - lib
124
174
  required_ruby_version: !ruby/object:Gem::Requirement
175
+ none: false
125
176
  requirements:
126
177
  - - ">="
127
178
  - !ruby/object:Gem::Version
179
+ hash: 3
180
+ segments:
181
+ - 0
128
182
  version: "0"
129
- version:
130
183
  required_rubygems_version: !ruby/object:Gem::Requirement
184
+ none: false
131
185
  requirements:
132
186
  - - ">="
133
187
  - !ruby/object:Gem::Version
188
+ hash: 3
189
+ segments:
190
+ - 0
134
191
  version: "0"
135
- version:
136
192
  requirements: []
137
193
 
138
194
  rubyforge_project:
139
- rubygems_version: 1.3.5
195
+ rubygems_version: 1.3.7
140
196
  signing_key:
141
197
  specification_version: 3
142
198
  summary: Extracts or retrieves content-related metadata from HTML pages
@@ -144,4 +200,3 @@ test_files:
144
200
  - test/helper.rb
145
201
  - test/test_corpus.rb
146
202
  - test/test_pismo_document.rb
147
- - test/test_readability.rb
@@ -1,342 +0,0 @@
1
- # This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
2
- #
3
- # This is a Ruby port of arc90's readability project
4
- # http://lab.arc90.com/experiments/readability/
5
- # Given a html document, it pulls out the main body text and cleans it up.
6
- # Ruby port by starrhorne and iterationlabs
7
- #
8
- # Original JavaScript version:
9
- # http://lab.arc90.com/experiments/readability/js/readability.js
10
- # * Copyright (c) 2009 Arc90 Inc
11
- # * Readability is licensed under the Apache License, Version 2.0.
12
- #
13
- # Minor edits and tweaks by Peter Cooper
14
-
15
- require 'nokogiri'
16
-
17
- IS_RUBY19 = "a".respond_to?(:encoding)
18
-
19
- module Readability
20
- class Document
21
- TEXT_LENGTH_THRESHOLD = 25
22
- RETRY_LENGTH = 250
23
-
24
- attr_accessor :options, :html
25
-
26
- def initialize(input, options = {})
27
- @input = input
28
- @options = options
29
- make_html
30
- end
31
-
32
- def make_html
33
- @html = Nokogiri::HTML(@input) #, nil, 'UTF-8')
34
- end
35
-
36
- REGEXES = {
37
- :unlikelyCandidatesRe => /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
38
- :okMaybeItsACandidateRe => /and|article|body|column|main/i,
39
- :positiveRe => /article|body|content|entry|hentry|page|pagination|post|story|text/i,
40
- :negativeRe => /combx|comment|contact|foot|box_wrap|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags/i,
41
- :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
42
- :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
43
- :replaceFontsRe => /<(\/?)font[^>]*>/i,
44
- :trimRe => /^\s+|\s+$/,
45
- :normalizeRe => /\s{2,}/,
46
- :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
47
- :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
48
- }
49
-
50
- def content(remove_unlikely_candidates = true)
51
- @html.css("script, style").each { |i| i.remove }
52
-
53
- remove_unlikely_candidates! if remove_unlikely_candidates
54
- transform_misused_divs_into_paragraphs!
55
- candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
56
- best_candidate = select_best_candidate(candidates)
57
- article = get_article(candidates, best_candidate)
58
- cleaned_article = sanitize(article, candidates, options)
59
- cleaned_article.gsub!(/^\s+\n/, "\n")
60
- cleaned_article.gsub!(/[\ \t]+/, ' ')
61
- cleaned_article.gsub!(/^\s+/, '')
62
- cleaned_article.gsub!(/\<\!\-\-.*?\-\-\>/m, '')
63
- if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
64
- make_html
65
- content(false)
66
- else
67
- cleaned_article
68
- end
69
- end
70
-
71
- def get_article(candidates, best_candidate)
72
- # Now that we have the top candidate, look through its siblings for content that might also be related.
73
- # Things like preambles, content split by ads that we removed, etc.
74
-
75
- sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
76
- output = Nokogiri::XML::Node.new('div', @html)
77
-
78
- return output unless best_candidate[:elem]
79
-
80
- best_candidate[:elem].parent.children.each do |sibling|
81
- append = false
82
- append = true if sibling == best_candidate[:elem]
83
- append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
84
-
85
- if sibling.name.downcase == "p"
86
- link_density = get_link_density(sibling)
87
- node_content = sibling.text
88
- node_length = node_content.length
89
-
90
- if node_length > 80 && link_density < 0.25
91
- append = true
92
- elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
93
- append = true
94
- end
95
- end
96
-
97
- if append
98
- sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
99
- output << sibling
100
- end
101
- end
102
-
103
- output
104
- end
105
-
106
- def select_best_candidate(candidates)
107
- sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
108
-
109
- debug("Top 5 canidates:")
110
- sorted_candidates[0...5].each do |candidate|
111
- debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
112
- end
113
-
114
- best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
115
- #debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")
116
-
117
- best_candidate
118
- end
119
-
120
- def get_link_density(elem)
121
- link_length = elem.css("a").map {|i| i.text}.join("").length
122
- text_length = elem.text.length
123
- link_length / text_length.to_f
124
- end
125
-
126
- def score_paragraphs(min_text_length)
127
- candidates = {}
128
- @html.css("p,td").each do |elem|
129
- parent_node = elem.parent
130
- grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
131
- inner_text = elem.text
132
-
133
- # If this paragraph is less than 25 characters, don't even count it.
134
- next if inner_text.length < min_text_length
135
-
136
- candidates[parent_node] ||= score_node(parent_node)
137
- candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node
138
-
139
- content_score = 1
140
-
141
- begin
142
- content_score += inner_text.split(',').length
143
- content_score += [(inner_text.length / 100).to_i, 3].min
144
- rescue => e
145
- raise e unless IS_RUBY19
146
- inner_text.force_encoding('ASCII-8BIT')
147
- content_score += inner_text.split(',').length
148
- content_score += [(inner_text.length / 100).to_i, 3].min
149
- end
150
-
151
- candidates[parent_node][:content_score] += content_score
152
- candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
153
- end
154
-
155
- # Scale the final candidates score based on link density. Good content should have a
156
- # relatively small link density (5% or less) and be mostly unaffected by this operation.
157
- candidates.each do |elem, candidate|
158
- candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
159
- end
160
-
161
- candidates
162
- end
163
-
164
- def class_weight(e)
165
- weight = 0
166
- if e[:class] && e[:class] != ""
167
- if e[:class] =~ REGEXES[:negativeRe]
168
- weight -= 25
169
- end
170
-
171
- if e[:class] =~ REGEXES[:positiveRe]
172
- weight += 25
173
- end
174
- end
175
-
176
- if e[:id] && e[:id] != ""
177
- if e[:id] =~ REGEXES[:negativeRe]
178
- weight -= 25
179
- end
180
-
181
- if e[:id] =~ REGEXES[:positiveRe]
182
- weight += 25
183
- end
184
- end
185
-
186
- weight
187
- end
188
-
189
- def score_node(elem)
190
- content_score = class_weight(elem)
191
- case elem.name.downcase
192
- when "div"
193
- content_score += 5
194
- when "blockquote"
195
- content_score += 3
196
- when "form"
197
- content_score -= 3
198
- when "th"
199
- content_score -= 5
200
- end
201
- { :content_score => content_score, :elem => elem }
202
- end
203
-
204
- def debug(str)
205
- puts str if options[:debug]
206
- end
207
-
208
- def remove_unlikely_candidates!
209
- @html.css("*").each do |elem|
210
- str = "#{elem[:class]}#{elem[:id]}"
211
- if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
212
- debug("Removing unlikely candidate - #{str}")
213
- elem.remove
214
- end
215
- end
216
- end
217
-
218
- def transform_misused_divs_into_paragraphs!
219
- @html.css("*").each do |elem|
220
- if elem.name.downcase == "div"
221
- # transform <div>s that do not contain other block elements into <p>s
222
- elem_inner_html = IS_RUBY19 ? elem.inner_html.dup.force_encoding('ASCII-8BIT') : elem.inner_html
223
- if elem_inner_html !~ REGEXES[:divToPElementsRe]
224
- debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
225
- elem.name = "p"
226
- end
227
- else
228
- # wrap text nodes in p tags
229
- # elem.children.each do |child|
230
- # if child.text?
231
- ## debug("wrapping text node with a p")
232
- # child.swap("<p>#{child.text}</p>")
233
- # end
234
- # end
235
- end
236
- end
237
- end
238
-
239
- def sanitize(node, candidates, options = {})
240
- node.css("h1, h2, h3, h4, h5, h6").each do |header|
241
- header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
242
- end
243
-
244
- node.css("form, object, iframe, embed").each do |elem|
245
- elem.remove
246
- end
247
-
248
- # Remove empty <p> tags
249
- node.css("p").each do |elem|
250
- elem.remove if elem.content.strip.empty?
251
- end
252
-
253
- # Remove empty <div> tags
254
- node.css("div").each do |elem|
255
- elem.remove if elem.content.strip.empty?
256
- end
257
-
258
-
259
-
260
- # Conditionally clean <table>s, <ul>s, and <div>s
261
- node.css("table, ul, div").each do |el|
262
- weight = class_weight(el)
263
- content_score = candidates[el] ? candidates[el][:content_score] : 0
264
- name = el.name.downcase
265
-
266
- if weight + content_score < 0
267
- el.remove
268
- debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
269
- elsif (IS_RUBY19 && el.text.force_encoding("ASCII-8BIT").count(",") < 10) || (!IS_RUBY19 && el.text.count(",") < 10)
270
- counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
271
- counts["li"] -= 100
272
-
273
- content_length = el.text.length
274
- link_density = get_link_density(el)
275
- to_remove = false
276
- reason = ""
277
-
278
- if counts["img"] > counts["p"]
279
- reason = "too many images"
280
- to_remove = true
281
- elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
282
- reason = "more <li>s than <p>s"
283
- to_remove = true
284
- elsif counts["input"] > (counts["p"] / 3).to_i
285
- reason = "less than 3x <p>s than <input>s"
286
- to_remove = true
287
- elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
288
- reason = "too short a content length without a single image"
289
- to_remove = true
290
- elsif weight < 25 && link_density > 0.2
291
- reason = "too many links for its weight (#{weight})"
292
- to_remove = true
293
- elsif weight >= 25 && link_density > 0.5
294
- reason = "too many links for its weight (#{weight})"
295
- to_remove = true
296
- elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
297
- reason = "<embed>s with too short a content length, or too many <embed>s"
298
- to_remove = true
299
- end
300
-
301
- if to_remove
302
- debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
303
- el.remove
304
- end
305
- end
306
- end
307
-
308
- # We'll sanitize all elements using a whitelist
309
- whitelist = @options[:tags] || %w[p]
310
-
311
- # Use a hash for speed (don't want to make a million calls to include?)
312
- whitelist = Hash[ whitelist.zip([true] * whitelist.size) ]
313
-
314
- ([node] + node.css("*")).each do |el|
315
-
316
- # If element is in whitelist, delete all its attributes
317
- if whitelist[el.node_name]
318
- el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
319
-
320
- # Otherwise, replace the element with its contents
321
- else
322
- begin
323
- el.swap(el.text)
324
- rescue => e
325
- raise e unless IS_RUBY19
326
- el.swap(el.text.force_encoding("ASCII-8BIT"))
327
- end
328
- end
329
-
330
- end
331
-
332
- # Get rid of duplicate whitespace
333
- begin
334
- node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
335
- rescue => e
336
- raise e unless IS_RUBY19
337
- node.to_html.force_encoding("ASCII-8BIT").gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
338
- end
339
- end
340
-
341
- end
342
- end