pismo 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,7 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pismo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ hash: 7
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 6
9
+ - 0
10
+ version: 0.6.0
5
11
  platform: ruby
6
12
  authors:
7
13
  - Peter Cooper
@@ -9,69 +15,107 @@ autorequire:
9
15
  bindir: bin
10
16
  cert_chain: []
11
17
 
12
- date: 2010-06-01 00:00:00 +01:00
18
+ date: 2010-06-20 00:00:00 +01:00
13
19
  default_executable: pismo
14
20
  dependencies:
15
21
  - !ruby/object:Gem::Dependency
16
22
  name: shoulda
17
- type: :development
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
20
26
  requirements:
21
27
  - - ">="
22
28
  - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
23
32
  version: "0"
24
- version:
33
+ type: :development
34
+ version_requirements: *id001
25
35
  - !ruby/object:Gem::Dependency
26
- name: nokogiri
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
36
+ name: awesome_print
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
30
40
  requirements:
31
41
  - - ">="
32
42
  - !ruby/object:Gem::Version
43
+ hash: 3
44
+ segments:
45
+ - 0
33
46
  version: "0"
34
- version:
47
+ type: :development
48
+ version_requirements: *id002
35
49
  - !ruby/object:Gem::Dependency
36
- name: loofah
37
- type: :runtime
38
- version_requirement:
39
- version_requirements: !ruby/object:Gem::Requirement
50
+ name: jeweler
51
+ prerelease: false
52
+ requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
40
54
  requirements:
41
55
  - - ">="
42
56
  - !ruby/object:Gem::Version
57
+ hash: 3
58
+ segments:
59
+ - 0
43
60
  version: "0"
44
- version:
61
+ type: :runtime
62
+ version_requirements: *id003
45
63
  - !ruby/object:Gem::Dependency
46
- name: httparty
64
+ name: nokogiri
65
+ prerelease: false
66
+ requirement: &id004 !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ hash: 3
72
+ segments:
73
+ - 0
74
+ version: "0"
47
75
  type: :runtime
48
- version_requirement:
49
- version_requirements: !ruby/object:Gem::Requirement
76
+ version_requirements: *id004
77
+ - !ruby/object:Gem::Dependency
78
+ name: sanitize
79
+ prerelease: false
80
+ requirement: &id005 !ruby/object:Gem::Requirement
81
+ none: false
50
82
  requirements:
51
83
  - - ">="
52
84
  - !ruby/object:Gem::Version
85
+ hash: 3
86
+ segments:
87
+ - 0
53
88
  version: "0"
54
- version:
89
+ type: :runtime
90
+ version_requirements: *id005
55
91
  - !ruby/object:Gem::Dependency
56
92
  name: fast-stemmer
57
- type: :runtime
58
- version_requirement:
59
- version_requirements: !ruby/object:Gem::Requirement
93
+ prerelease: false
94
+ requirement: &id006 !ruby/object:Gem::Requirement
95
+ none: false
60
96
  requirements:
61
97
  - - ">="
62
98
  - !ruby/object:Gem::Version
99
+ hash: 3
100
+ segments:
101
+ - 0
63
102
  version: "0"
64
- version:
103
+ type: :runtime
104
+ version_requirements: *id006
65
105
  - !ruby/object:Gem::Dependency
66
106
  name: chronic
67
- type: :runtime
68
- version_requirement:
69
- version_requirements: !ruby/object:Gem::Requirement
107
+ prerelease: false
108
+ requirement: &id007 !ruby/object:Gem::Requirement
109
+ none: false
70
110
  requirements:
71
111
  - - ">="
72
112
  - !ruby/object:Gem::Version
113
+ hash: 3
114
+ segments:
115
+ - 0
73
116
  version: "0"
74
- version:
117
+ type: :runtime
118
+ version_requirements: *id007
75
119
  description: Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.
76
120
  email: git@peterc.org
77
121
  executables:
@@ -85,6 +129,7 @@ files:
85
129
  - .document
86
130
  - .gitignore
87
131
  - LICENSE
132
+ - NOTICE
88
133
  - README.markdown
89
134
  - Rakefile
90
135
  - VERSION
@@ -93,25 +138,30 @@ files:
93
138
  - lib/pismo/document.rb
94
139
  - lib/pismo/external_attributes.rb
95
140
  - lib/pismo/internal_attributes.rb
96
- - lib/pismo/readability.rb
141
+ - lib/pismo/reader.rb
97
142
  - lib/pismo/stopwords.txt
98
143
  - pismo.gemspec
99
144
  - test/corpus/bbcnews.html
145
+ - test/corpus/bbcnews2.html
100
146
  - test/corpus/briancray.html
101
147
  - test/corpus/cant_read.html
102
148
  - test/corpus/factor.html
149
+ - test/corpus/gmane.html
103
150
  - test/corpus/huffington.html
104
151
  - test/corpus/metadata_expected.yaml
105
152
  - test/corpus/metadata_expected.yaml.old
153
+ - test/corpus/queness.html
154
+ - test/corpus/reader_expected.yaml
106
155
  - test/corpus/rubyinside.html
107
156
  - test/corpus/rww.html
108
157
  - test/corpus/spolsky.html
109
158
  - test/corpus/techcrunch.html
159
+ - test/corpus/tweet.html
110
160
  - test/corpus/youtube.html
161
+ - test/corpus/zefrank.html
111
162
  - test/helper.rb
112
163
  - test/test_corpus.rb
113
164
  - test/test_pismo_document.rb
114
- - test/test_readability.rb
115
165
  has_rdoc: true
116
166
  homepage: http://github.com/peterc/pismo
117
167
  licenses: []
@@ -122,21 +172,27 @@ rdoc_options:
122
172
  require_paths:
123
173
  - lib
124
174
  required_ruby_version: !ruby/object:Gem::Requirement
175
+ none: false
125
176
  requirements:
126
177
  - - ">="
127
178
  - !ruby/object:Gem::Version
179
+ hash: 3
180
+ segments:
181
+ - 0
128
182
  version: "0"
129
- version:
130
183
  required_rubygems_version: !ruby/object:Gem::Requirement
184
+ none: false
131
185
  requirements:
132
186
  - - ">="
133
187
  - !ruby/object:Gem::Version
188
+ hash: 3
189
+ segments:
190
+ - 0
134
191
  version: "0"
135
- version:
136
192
  requirements: []
137
193
 
138
194
  rubyforge_project:
139
- rubygems_version: 1.3.5
195
+ rubygems_version: 1.3.7
140
196
  signing_key:
141
197
  specification_version: 3
142
198
  summary: Extracts or retrieves content-related metadata from HTML pages
@@ -144,4 +200,3 @@ test_files:
144
200
  - test/helper.rb
145
201
  - test/test_corpus.rb
146
202
  - test/test_pismo_document.rb
147
- - test/test_readability.rb
@@ -1,342 +0,0 @@
1
- # This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
2
- #
3
- # This is a Ruby port of arc90's readability project
4
- # http://lab.arc90.com/experiments/readability/
5
- # Given a html document, it pulls out the main body text and cleans it up.
6
- # Ruby port by starrhorne and iterationlabs
7
- #
8
- # Original JavaScript version:
9
- # http://lab.arc90.com/experiments/readability/js/readability.js
10
- # * Copyright (c) 2009 Arc90 Inc
11
- # * Readability is licensed under the Apache License, Version 2.0.
12
- #
13
- # Minor edits and tweaks by Peter Cooper
14
-
15
- require 'nokogiri'
16
-
17
- IS_RUBY19 = "a".respond_to?(:encoding)
18
-
19
- module Readability
20
- class Document
21
- TEXT_LENGTH_THRESHOLD = 25
22
- RETRY_LENGTH = 250
23
-
24
- attr_accessor :options, :html
25
-
26
- def initialize(input, options = {})
27
- @input = input
28
- @options = options
29
- make_html
30
- end
31
-
32
- def make_html
33
- @html = Nokogiri::HTML(@input) #, nil, 'UTF-8')
34
- end
35
-
36
- REGEXES = {
37
- :unlikelyCandidatesRe => /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
38
- :okMaybeItsACandidateRe => /and|article|body|column|main/i,
39
- :positiveRe => /article|body|content|entry|hentry|page|pagination|post|story|text/i,
40
- :negativeRe => /combx|comment|contact|foot|box_wrap|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags/i,
41
- :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
42
- :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
43
- :replaceFontsRe => /<(\/?)font[^>]*>/i,
44
- :trimRe => /^\s+|\s+$/,
45
- :normalizeRe => /\s{2,}/,
46
- :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
47
- :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
48
- }
49
-
50
- def content(remove_unlikely_candidates = true)
51
- @html.css("script, style").each { |i| i.remove }
52
-
53
- remove_unlikely_candidates! if remove_unlikely_candidates
54
- transform_misused_divs_into_paragraphs!
55
- candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
56
- best_candidate = select_best_candidate(candidates)
57
- article = get_article(candidates, best_candidate)
58
- cleaned_article = sanitize(article, candidates, options)
59
- cleaned_article.gsub!(/^\s+\n/, "\n")
60
- cleaned_article.gsub!(/[\ \t]+/, ' ')
61
- cleaned_article.gsub!(/^\s+/, '')
62
- cleaned_article.gsub!(/\<\!\-\-.*?\-\-\>/m, '')
63
- if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
64
- make_html
65
- content(false)
66
- else
67
- cleaned_article
68
- end
69
- end
70
-
71
- def get_article(candidates, best_candidate)
72
- # Now that we have the top candidate, look through its siblings for content that might also be related.
73
- # Things like preambles, content split by ads that we removed, etc.
74
-
75
- sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
76
- output = Nokogiri::XML::Node.new('div', @html)
77
-
78
- return output unless best_candidate[:elem]
79
-
80
- best_candidate[:elem].parent.children.each do |sibling|
81
- append = false
82
- append = true if sibling == best_candidate[:elem]
83
- append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
84
-
85
- if sibling.name.downcase == "p"
86
- link_density = get_link_density(sibling)
87
- node_content = sibling.text
88
- node_length = node_content.length
89
-
90
- if node_length > 80 && link_density < 0.25
91
- append = true
92
- elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
93
- append = true
94
- end
95
- end
96
-
97
- if append
98
- sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
99
- output << sibling
100
- end
101
- end
102
-
103
- output
104
- end
105
-
106
- def select_best_candidate(candidates)
107
- sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
108
-
109
- debug("Top 5 canidates:")
110
- sorted_candidates[0...5].each do |candidate|
111
- debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
112
- end
113
-
114
- best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
115
- #debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")
116
-
117
- best_candidate
118
- end
119
-
120
- def get_link_density(elem)
121
- link_length = elem.css("a").map {|i| i.text}.join("").length
122
- text_length = elem.text.length
123
- link_length / text_length.to_f
124
- end
125
-
126
- def score_paragraphs(min_text_length)
127
- candidates = {}
128
- @html.css("p,td").each do |elem|
129
- parent_node = elem.parent
130
- grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
131
- inner_text = elem.text
132
-
133
- # If this paragraph is less than 25 characters, don't even count it.
134
- next if inner_text.length < min_text_length
135
-
136
- candidates[parent_node] ||= score_node(parent_node)
137
- candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node
138
-
139
- content_score = 1
140
-
141
- begin
142
- content_score += inner_text.split(',').length
143
- content_score += [(inner_text.length / 100).to_i, 3].min
144
- rescue => e
145
- raise e unless IS_RUBY19
146
- inner_text.force_encoding('ASCII-8BIT')
147
- content_score += inner_text.split(',').length
148
- content_score += [(inner_text.length / 100).to_i, 3].min
149
- end
150
-
151
- candidates[parent_node][:content_score] += content_score
152
- candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
153
- end
154
-
155
- # Scale the final candidates score based on link density. Good content should have a
156
- # relatively small link density (5% or less) and be mostly unaffected by this operation.
157
- candidates.each do |elem, candidate|
158
- candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
159
- end
160
-
161
- candidates
162
- end
163
-
164
- def class_weight(e)
165
- weight = 0
166
- if e[:class] && e[:class] != ""
167
- if e[:class] =~ REGEXES[:negativeRe]
168
- weight -= 25
169
- end
170
-
171
- if e[:class] =~ REGEXES[:positiveRe]
172
- weight += 25
173
- end
174
- end
175
-
176
- if e[:id] && e[:id] != ""
177
- if e[:id] =~ REGEXES[:negativeRe]
178
- weight -= 25
179
- end
180
-
181
- if e[:id] =~ REGEXES[:positiveRe]
182
- weight += 25
183
- end
184
- end
185
-
186
- weight
187
- end
188
-
189
- def score_node(elem)
190
- content_score = class_weight(elem)
191
- case elem.name.downcase
192
- when "div"
193
- content_score += 5
194
- when "blockquote"
195
- content_score += 3
196
- when "form"
197
- content_score -= 3
198
- when "th"
199
- content_score -= 5
200
- end
201
- { :content_score => content_score, :elem => elem }
202
- end
203
-
204
- def debug(str)
205
- puts str if options[:debug]
206
- end
207
-
208
- def remove_unlikely_candidates!
209
- @html.css("*").each do |elem|
210
- str = "#{elem[:class]}#{elem[:id]}"
211
- if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
212
- debug("Removing unlikely candidate - #{str}")
213
- elem.remove
214
- end
215
- end
216
- end
217
-
218
- def transform_misused_divs_into_paragraphs!
219
- @html.css("*").each do |elem|
220
- if elem.name.downcase == "div"
221
- # transform <div>s that do not contain other block elements into <p>s
222
- elem_inner_html = IS_RUBY19 ? elem.inner_html.dup.force_encoding('ASCII-8BIT') : elem.inner_html
223
- if elem_inner_html !~ REGEXES[:divToPElementsRe]
224
- debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
225
- elem.name = "p"
226
- end
227
- else
228
- # wrap text nodes in p tags
229
- # elem.children.each do |child|
230
- # if child.text?
231
- ## debug("wrapping text node with a p")
232
- # child.swap("<p>#{child.text}</p>")
233
- # end
234
- # end
235
- end
236
- end
237
- end
238
-
239
- def sanitize(node, candidates, options = {})
240
- node.css("h1, h2, h3, h4, h5, h6").each do |header|
241
- header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
242
- end
243
-
244
- node.css("form, object, iframe, embed").each do |elem|
245
- elem.remove
246
- end
247
-
248
- # Remove empty <p> tags
249
- node.css("p").each do |elem|
250
- elem.remove if elem.content.strip.empty?
251
- end
252
-
253
- # Remove empty <div> tags
254
- node.css("div").each do |elem|
255
- elem.remove if elem.content.strip.empty?
256
- end
257
-
258
-
259
-
260
- # Conditionally clean <table>s, <ul>s, and <div>s
261
- node.css("table, ul, div").each do |el|
262
- weight = class_weight(el)
263
- content_score = candidates[el] ? candidates[el][:content_score] : 0
264
- name = el.name.downcase
265
-
266
- if weight + content_score < 0
267
- el.remove
268
- debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
269
- elsif (IS_RUBY19 && el.text.force_encoding("ASCII-8BIT").count(",") < 10) || (!IS_RUBY19 && el.text.count(",") < 10)
270
- counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
271
- counts["li"] -= 100
272
-
273
- content_length = el.text.length
274
- link_density = get_link_density(el)
275
- to_remove = false
276
- reason = ""
277
-
278
- if counts["img"] > counts["p"]
279
- reason = "too many images"
280
- to_remove = true
281
- elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
282
- reason = "more <li>s than <p>s"
283
- to_remove = true
284
- elsif counts["input"] > (counts["p"] / 3).to_i
285
- reason = "less than 3x <p>s than <input>s"
286
- to_remove = true
287
- elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
288
- reason = "too short a content length without a single image"
289
- to_remove = true
290
- elsif weight < 25 && link_density > 0.2
291
- reason = "too many links for its weight (#{weight})"
292
- to_remove = true
293
- elsif weight >= 25 && link_density > 0.5
294
- reason = "too many links for its weight (#{weight})"
295
- to_remove = true
296
- elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
297
- reason = "<embed>s with too short a content length, or too many <embed>s"
298
- to_remove = true
299
- end
300
-
301
- if to_remove
302
- debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
303
- el.remove
304
- end
305
- end
306
- end
307
-
308
- # We'll sanitize all elements using a whitelist
309
- whitelist = @options[:tags] || %w[p]
310
-
311
- # Use a hash for speed (don't want to make a million calls to include?)
312
- whitelist = Hash[ whitelist.zip([true] * whitelist.size) ]
313
-
314
- ([node] + node.css("*")).each do |el|
315
-
316
- # If element is in whitelist, delete all its attributes
317
- if whitelist[el.node_name]
318
- el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
319
-
320
- # Otherwise, replace the element with its contents
321
- else
322
- begin
323
- el.swap(el.text)
324
- rescue => e
325
- raise e unless IS_RUBY19
326
- el.swap(el.text.force_encoding("ASCII-8BIT"))
327
- end
328
- end
329
-
330
- end
331
-
332
- # Get rid of duplicate whitespace
333
- begin
334
- node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
335
- rescue => e
336
- raise e unless IS_RUBY19
337
- node.to_html.force_encoding("ASCII-8BIT").gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
338
- end
339
- end
340
-
341
- end
342
- end