pismo 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,390 @@
1
+ require 'nokogiri'
2
+ require 'sanitize'
3
+
4
module Pismo
  module Reader
    # Extracts the most likely "article body" from an HTML page by scoring
    # candidate branches of the parsed document tree.
    class Document
      # raw_content        - the (sanitized, normalized) HTML source
      # doc                - the parsed Nokogiri document
      # content_candidates - top scored branches, best first (set by build_analysis_tree)
      attr_reader :raw_content, :doc, :content_candidates

      # Elements to keep for /input/ sanitization
      OK_ELEMENTS = %w{a td br th tbody table tr div span img strong em b i body html head title p h1 h2 h3 h4 h5 h6 pre code tt ul li ol blockquote font big small section article abbr audio video cite dd dt figure caption sup form dl dt dd}

      # Build a tree of attributes that are allowed for each element.. doing it this messy way due to how Sanitize works, alas
      OK_ATTRIBUTES = {}
      OK_CLEAN_ATTRIBUTES = {}
      OK_ELEMENTS.each { |el| OK_ATTRIBUTES[el] = %w{id class href name content type alt title src} }
      OK_ELEMENTS.each { |el| OK_CLEAN_ATTRIBUTES[el] = %w{href title src alt} }

      # Words that we'd like to see in class and ID names for "content"
      GOOD_WORDS = %w{content post blogpost main story body entry text desc asset hentry single entrytext postcontent bodycontent}.uniq

      # Words that indicate crap in general
      BAD_WORDS = %w{reply metadata options commenting comments comment about footer header outer credit sidebar widget subscribe clearfix date social bookmarks links share video watch excerpt related supplement accessibility offscreen meta title signup blq secondary feedback featured clearfix small job jobs listing listings navigation nav byline addcomment postcomment trackback neighbor snap nopreview ads commentform fbfans login similar thumb link blogroll grid twitter wrapper container nav sitesub printfooter editsection visualclear catlinks hidden toc contentsub caption disqus rss shoutbox sponsor}.uniq

      # Words that kill a branch dead
      FATAL_WORDS = %w{comments comment bookmarks social links ads related similar footer digg totop metadata sitesub nav sidebar commenting options addcomment leaderboard offscreen job prevlink prevnext navigation reply-link hide hidden sidebox archives vcard}

      # Words typical of article "furniture" (dates, bylines, share links, pagination)
      # rather than body copy; used to discount text runs that match them.
      META_WORDS = %w{january february march april may june july august september october november december jan feb mar apr may jun jul aug sep oct nov dec st th rd nd comments written posted on at published 2000 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 updated last gmt est pst pdt edt cet cdt cst article feature featured filed under comment comments follow twitter facebook email e-mail register story continue continues reading read inside more page next related response responses respond contact street phone tel email e-mail fax info tags tagged tag thanks credit creative commons copy nbsp lt gt this friend printable version subscribe rss mail follow twitter article via leave}.uniq

      # Elements that cannot plausibly hold the full article body on their own,
      # versus elements that could wrap the full body.
      WONT_CONTAIN_FULL_CONTENT = %w{h1 h2 h3 h4 h5 h6 h6 li ol ul br a img meta cite strong em i b input head small big code title sup sub dd dt}
      COULD_CONTAIN_FULL_CONTENT = %w{body div p table tr td article pre blockquote tbody section}

      ## Output sanitization element sets
      BLOCK_OUTPUT_ELEMENTS = %w{div p h2 h3 h4 h5 h6 li dl pre ul ol blockquote section article audio video cite dd dt figure caption br table tr td thead tbody tfoot}
      INLINE_OUTPUT_ELEMENTS = %w{a img b strong em i br code sup font small big dd dt}
      OUTPUT_ELEMENTS = BLOCK_OUTPUT_ELEMENTS + INLINE_OUTPUT_ELEMENTS
      NON_HEADER_ELEMENTS = %w{p br}
38
+
39
      # Create a document object based on the raw HTML content provided
      #
      # raw_content - a String of HTML; it is mutated in place during
      #               sanitization/normalization by build_doc.
      def initialize(raw_content)
        @raw_content = raw_content
        build_doc
      end
44
+
45
+ def build_doc
46
+ @content = {}
47
+
48
+ if RUBY_VERSION > "1.9"
49
+ @raw_content.encode!("UTF-8", :invalid => :replace, :replace => '?') if @raw_content.encoding != "UTF-8"
50
+ @raw_content.encode!("ASCII-8BIT", :invalid => :replace, :replace => '?') if !@raw_content.valid_encoding?
51
+ end
52
+
53
+ # Normalize whitespace (as much to make debugging sessions look nice as anything else)
54
+ @raw_content.gsub!(/\s{2,}/, ' ')
55
+ @raw_content.gsub!(/\r/, "\n")
56
+ @raw_content.gsub!(/\n{3,}/, "\n\n")
57
+ @raw_content.gsub!(/(\<br(\s\/)?\>){2,}/, "</p><p>")
58
+
59
+ # Remove scripts manually, Sanitize and/or Nokogiri seem to go a bit funny with them
60
+ @raw_content.gsub!(/\<script .*?\<\/script\>/im, '')
61
+
62
+ # Sanitize the HTML
63
+ @raw_content = Sanitize.clean(@raw_content,
64
+ :elements => OK_ELEMENTS,
65
+ :attributes => OK_ATTRIBUTES,
66
+ :remove_contents => true,
67
+ :output_encoding => 'utf-8'
68
+ )
69
+
70
+ @doc = Nokogiri::HTML(@raw_content, nil, 'utf-8')
71
+
72
+ build_analysis_tree
73
+ end
74
+
75
+
76
      # Analyze the structure of the HTML document and score branches for likelihood of containing useful content
      #
      # Walks every element that could plausibly wrap the article body
      # (COULD_CONTAIN_FULL_CONTENT), builds a scored "branch" hash for each,
      # stores them in @tree keyed by element path, and keeps the top five
      # (best first) in @content_candidates.
      def build_analysis_tree
        @tree = {}
        subels = {}  # per-path memo cache, shared across branches since subtrees overlap

        # NOTE(review): t1 feeds the timing debug lines commented out at the
        # bottom of this method; it is otherwise unused.
        t1 = Time.now.to_i + (Time.now.usec.to_f / 1000000)

        # Do a pre clean up of elements.
        @doc.css("div, span, table, tr, td, pre").each do |el|
          # Any block elements with no child block elements can become paragraphs
          if (BLOCK_OUTPUT_ELEMENTS & el.inner_html.scan(/\<(\w+)/).flatten).empty?
            el.name = "p"
          elsif el.name != "span"
            el.name = "div"
          end

          # Any SPANs that aren't within paragraphs can become paragraphs too
          el.name = "p" if el.name == "span" && !el.path.scan(/[a-z]+/).include?('p')

          # Elements whose id/class contain a fatal word are dropped outright
          el.remove if (FATAL_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)).size > 0
        end

        @doc.css(COULD_CONTAIN_FULL_CONTENT.join(", ")).each do |el|
          # Assume that no content we'll want comes in a total package of fewer than 80 characters!
          next unless el.text.to_s.strip.length >= 80

          ids = (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)
          path_segments = el.path.scan(/[a-z]+/)[2..-1] || []
          depth = path_segments.length

          branch = {}
          branch[:ids] = ids
          # Start with bad-word penalties, then add good-word bonuses; skip
          # the branch entirely if it nets out negative.
          branch[:score] = -(BAD_WORDS & ids).size
          branch[:score] += (GOOD_WORDS & ids).size
          next if branch[:score] < 0

          #puts "#{ids.join(",")} - #{branch[:score].to_s} - #{el.text.to_s.strip.length}"

          # Elements that have an ID or class are more likely to be our winners
          branch[:score] += 2 unless ids.empty?

          branch[:name] = el.name
          branch[:depth] = depth
          branch[:path] = el.path

          branch[:raw_word_count] = 0
          branch[:word_count] = 0
          branch[:child_count] = 0
          branch[:bad_child_count] = 0
          branch[:score_steps] = []

          el.traverse do |subel|
            # NOTE(review): div_at_end_of_branch is assigned but never read.
            div_at_end_of_branch = false if subel.name == "div"
            path = subel.path
            subels[path] ||= {}
            subels[path][:path_segments] ||= (path.scan(/[a-z]+/)[2..-1] || [])
            subels[path][:is_text] ||= subel.text?

            if subels[path][:is_text]
              subels[path][:text] ||= subel.text.downcase.scan(/[a-z]+/)
              next if subels[path][:text].empty?

              subels[path][:raw_word_count] ||= subels[path][:text].size
              # Text inside links/headers counts zero "real" words; otherwise
              # only words longer than three characters count.
              # NOTE(review): %{a h1 ...} is a String literal (not %w), so
              # include? does a substring check on the parent tag name — likely
              # intended as %w{a h1 h2 h3 h4 h5 h6} but behaves acceptably for
              # real tag names here; "h6" is also duplicated.
              subels[path][:word_count] ||= (%{a h1 h2 h3 h4 h5 h6 h6}.include?(subel.parent.name) ? 0 : subels[path][:text].select { |word| word.length > 3 }.size)
              subels[path][:meta_matches] ||= (subels[path][:text] & META_WORDS).size

              branch[:raw_word_count] += subels[path][:raw_word_count]
              branch[:word_count] += subels[path][:word_count] - subels[path][:meta_matches]
            end

            subels[path][:ids] ||= (subel['id'].to_s + ' ' + subel['class'].to_s).gsub(/[^a-z]/, ' ').downcase.strip.split(/\s+/)
            # Can go negative when good words outnumber bad ones for a child
            subels[path][:bad_child_count_inc] = (BAD_WORDS & subels[path][:ids]).size - (GOOD_WORDS & subels[path][:ids]).size
            subels[path][:child_count_inc] = subels[path][:ids].empty? ? 0 : 1

            branch[:bad_child_count] += subels[path][:bad_child_count_inc]
            branch[:child_count] += subels[path][:child_count_inc]
          end

          branch[:score] += 2 if branch[:name] == "div"
          # Prose tends to be rich in ", " sequences
          branch[:score] += 4 if el.text.scan(/\,\s/).size > 10
          branch[:score_steps] << "lots of commas!" if el.text.scan(/\,\s/).size > 5
          branch[:score] *= 3

          branch[:score] *= 0.7 if el.children && el.children.size < 3
          branch[:score] *= 1.25 if branch[:raw_word_count] > 10
          next if branch[:raw_word_count] < 10
          branch[:score] += [branch[:word_count], 1].max ** 0.5

          # Ratio of counted words to id/class-bearing descendants
          word_child_count_ratio = branch[:word_count].to_f / [branch[:child_count], 1].max
          branch[:word_child_count_ratio] = word_child_count_ratio

          if branch[:raw_word_count] > 100
            good_word_ratio = branch[:word_count].to_f / branch[:raw_word_count]
            branch[:score] += good_word_ratio * 12

            if word_child_count_ratio > 50
              branch[:score] *= 1.5
            elsif word_child_count_ratio > 30
              branch[:score] *= 1.2
            elsif word_child_count_ratio > 15
              branch[:score] *= 1.1
            elsif word_child_count_ratio < 4
              branch[:score] *= 0.9
            end
          end

          branch[:score_steps] << "s1: #{branch[:score]}"

          # Penalize branches whose children look like navigation/comment cruft
          bad_child_ratio = branch[:bad_child_count].to_f / [branch[:child_count], 1].max
          branch[:bad_child_ratio] = bad_child_ratio
          branch[:score] += 3 if bad_child_ratio < 0.0
          branch[:score] -= 3 if bad_child_ratio > 0.15
          branch[:score] -= 2 if bad_child_ratio > 0.25
          branch[:score] -= 2 if bad_child_ratio > 0.4
          branch[:score] -= 4 if bad_child_ratio > 0.5
          branch[:score] -= 5 if bad_child_ratio > 0.7
          branch[:score] -= 5 if branch[:bad_child_count] > 20

          # Deeper elements are more likely to be the tight article wrapper
          branch[:score] += depth

          @tree[el.path] = branch
        end

        sorted_tree = @tree.sort_by { |k, v| v[:score] }

        #ap @doc.at(sorted_tree.first[0]).text

        # Sort the branches by their score in reverse order
        @content_candidates = sorted_tree.reverse.first([5, sorted_tree.length].min)

        @content_candidates #.map { |i| [i[0], i[1][:name], i[1][:ids].join(','), i[1][:score] ]}
        #ap @content_candidates
        #t2 = Time.now.to_i + (Time.now.usec.to_f / 1000000)
        #puts t2 - t1
        #exit

      end
219
+
220
+
221
      # Return the content from best match number of index (default 0) and, optionally, clean it to plain-text
      #
      # clean - when true, reduce the output toward plain text (block elements
      #         only, then tags stripped); when false, return sanitized HTML
      #         using the OUTPUT_ELEMENTS whitelist.
      # index - which ranked candidate branch to render (0 = best).
      #
      # Results are memoized in @content keyed by [clean, index]. Note that
      # rendering destructively prunes elements from @doc's chosen branch.
      def content(clean = false, index = 0)
        return @content[[clean, index]] if @content[[clean, index]]
        return '' unless @content_candidates && !@content_candidates.empty?

        content_branch = @doc.at(@content_candidates[index].first)
        # NOTE(review): orphans_to_remove is never populated below, so the
        # removal pass further down is currently a no-op.
        orphans_to_remove = []

        #ap content_branch.to_html
        #exit

        # Go through every piece of the content and rip out sections that contain too many tags compared to words
        # This is usually indicative of "widgets" or link bar sections
        content_branch.css('*').each_with_index do |el, i|
          next unless el

          # The page H1 is assumed to be the title, not body content
          if el.name == "h1"
            el.remove
            next
          end

          # A lone H2 is likely a repeated title as well
          if el.name == "h2" && content_branch.inner_html.scan('<h2').size == 1
            el.remove
          end

          # Remove elements that contain words but there are more tags than words overall
          # First, count the words
          #word_count = 0
          #el.traverse do |subel|
          #  if subel.text? && subel.path !~ /\/a\// && subel.path !~ /\/(h1|h2|h3|h4|h5|h6)\//
          #    word_count += (subel.text.downcase.scan(/[a-z]{4,}/) - META_WORDS).size
          #  end
          #end
          #
          ## .. then count the tags
          #
          #inner_tags = el.inner_html.scan(/\<\w.*?\>/).size
          #if word_count < inner_tags && inner_tags > 3 && word_count < 250
          #  puts "At #{el.name} #{el['id']} #{el['class']} containing '#{el.text[0..20]}' we have #{word_count} valid words to #{el.inner_html.scan(/\<\w.*?\>/).size} tags"
          #  #puts "Removing #{el.name} #{el['id']} #{el['class']} TOO MANY TAGS FOR WORDS"
          #  el.remove
          #  next
          #end

          # If there are at least 2 words and a third of them are "meta words," remove the element
          #inner_words = el.text.to_s.downcase.scan(/[a-z]{3,}/)
          #if BLOCK_OUTPUT_ELEMENTS.include?(el.name) && inner_words.size >= 2
          #  if ((inner_words & META_WORDS).size >= (inner_words.size / 3))
          #    el.remove
          #  end
          #end

          # Drop near-empty elements unless they are, or contain, an image
          if el.text && el.text.strip.length < 3 && !%w{img}.include?(el.name) && el.inner_html !~ /\<img/
            el.remove
            next
          end

          # Paragraphs with no sentence-ending period and no image are probably furniture
          if el.name == "p" && el.text !~ /\.(\s|$)/ && el.inner_html !~ /\<img/
            el.remove
            next
          end

          # If the ID or class of the element contains a fatally bad word, get rid of it
          if (BAD_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.scan(/[a-z]+/)).length > 0
            #puts "Removing #{el.name} #{el['id']} #{el['class']} BAD"
            el.remove
            next
          end
        end

        # If a title was found early in the result document but had text before it, remove that text - it's probably crap
        orphans_to_remove.each { |el| el.remove }

        # Clean up the HTML again - Nokogiri outputs it with full doctype and crap
        clean_html = strip(Sanitize.clean(content_branch.to_html, :elements => (clean ? BLOCK_OUTPUT_ELEMENTS : OUTPUT_ELEMENTS), :attributes => (clean ? OK_CLEAN_ATTRIBUTES : OK_ATTRIBUTES)))

        # If the content is desired as "clean" (i.e. plain-text), do some quick fix-ups
        if clean
          # Get rid of line break tags, make list items look nice, remove all other HTML tags, and clean up spaces and newlines
          clean_html.gsub!(/<br.*?>/, "\n")
          clean_html.gsub!(/<li>/, '* ')
          clean_html.gsub!(/<\w+>/, '')
          clean_html.gsub!(/<\/\w+>/, "\n")
          clean_html.gsub!(/\ +/, ' ')
          clean_html.gsub!(/^\s+\n/, "\n")
          clean_html.gsub!(/\n{2,}/, "\n")
          clean_html.strip!
        end

        # If tags butt up against each other across lines, remove the line break(s)
        clean_html.gsub!(/\>\n+\</, '><')

        # Get rid of images whose sources are relative (TODO: Make this optional)
        clean_html.gsub!(/\<img .*?\>/i) do |img_tag|
          img_tag =~ /\Whttp/ ? img_tag : ''
        end

        # Remove empty tags
        clean_html.gsub!(/<(\w+)><\/\1>/, "")

        # Trim leading space from lines but without removing blank lines
        #clean_html.gsub!(/^\ +(?=\S)/, '')

        # Just a messy, hacky way to make output look nicer with subsequent paragraphs..
        clean_html.gsub!(/<\/(div|p|h1|h2|h3|h4|h5|h6)>/, '</\1>' + "\n\n")

        # Get rid of bullshit "smart" quotes
        # (byte-wise replacement, hence the temporary ASCII-8BIT re-tag on 1.9+)
        clean_html.force_encoding("ASCII-8BIT") if RUBY_VERSION > "1.9"
        clean_html.gsub!("\xe2\x80\x89", " ")
        clean_html.gsub!("\xe2\x80\x99", "'")
        clean_html.gsub!("\xe2\x80\x98", "'")
        clean_html.gsub!("\xe2\x80\x9c", '"')
        clean_html.gsub!("\xe2\x80\x9d", '"')
        clean_html.force_encoding("UTF-8") if RUBY_VERSION > "1.9"

        # Memoize and return
        @content[[clean, index]] = clean_html
      end
338
+
339
+ def sentences(qty = 3)
340
+ # ap content
341
+ clean_content = Sanitize.clean(content, :elements => NON_HEADER_ELEMENTS, :attributes => OK_CLEAN_ATTRIBUTES, :remove_contents => %w{h1 h2 h3 h4 h5 h6})
342
+ #ap clean_content
343
+ #exit
344
+ fodder = ''
345
+ doc = Nokogiri::HTML(clean_content, nil, 'utf-8')
346
+
347
+ doc.traverse do |el|
348
+ path_segments = el.path.scan(/[a-z]+/)[2..-1]
349
+ next unless path_segments && path_segments.length > 1
350
+ if el.text? && el.text.strip.length < 3
351
+ el.remove
352
+ next
353
+ end
354
+ if el.text? && NON_HEADER_ELEMENTS.include?(path_segments[-2])
355
+ text = el.text.strip
356
+ text += "." if text !~ /[\.\!\?\"\']$/
357
+ fodder += text + "\n"
358
+ end
359
+ end
360
+
361
+ fodder = content(true) if fodder.to_s.length < 50
362
+ fodder.gsub!(/\b\w\W\s/, '')
363
+
364
+ sentences = fodder.scan(/([\&\w\s\-\'\,\+\.\/\\\:\#\(\)\=\"\?\!]+?[\.\?\!])(\s|\Z)/im).map { |s| s.first }
365
+
366
+ sentences.compact!
367
+ sentences.map! { |s| s.strip }
368
+ sentences.map! { |s| s.sub(/^[^\"\'a-z0-9\(\[]+/im, '') }
369
+ sentences.map! { |s| s.sub(/[^a-z0-9\'\"\)\]\.\!\:\?]+$/im, '') }
370
+ sentences.map! { |s| s.gsub(/\s+/m, ' ') }
371
+ sentences.first(qty)
372
+ end
373
+
374
+ def images(qty = 3)
375
+ doc = Nokogiri::HTML(content, nil, 'utf-8')
376
+ images = []
377
+ doc.css("img").each do |img|
378
+ images << img['src']
379
+ break if images.length == qty
380
+ end
381
+ images
382
+ end
383
+
384
+ # Remove leading and trailing spaces on lines throughout a string (a bit like String#strip, but for multi-lines)
385
+ def strip(s)
386
+ s.gsub(/^\s+/, '').gsub(/\s+$/, '')
387
+ end
388
+ end
389
+ end
390
+ end
data/lib/pismo.rb CHANGED
@@ -4,11 +4,12 @@ require 'open-uri'
4
4
  require 'nokogiri'
5
5
  require 'fast_stemmer'
6
6
  require 'chronic'
7
+ require 'sanitize'
7
8
  require 'tempfile'
8
9
 
9
10
  $: << File.dirname(__FILE__)
10
11
  require 'pismo/document'
11
- require 'pismo/readability'
12
+ require 'pismo/reader'
12
13
 
13
14
  module Pismo
14
15
  # Sugar methods to make creating document objects nicer
@@ -20,7 +21,7 @@ module Pismo
20
21
  # (mostly useful for debugging use)
21
22
  def self.[](url)
22
23
  @docs ||= {}
23
- @docs[url] ||= Pismo::Document.new(open(url))
24
+ @docs[url] ||= Pismo::Document.new(url)
24
25
  end
25
26
 
26
27
 
data/pismo.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{pismo}
8
- s.version = "0.5.0"
8
+ s.version = "0.6.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Peter Cooper"]
12
- s.date = %q{2010-06-01}
12
+ s.date = %q{2010-06-20}
13
13
  s.default_executable = %q{pismo}
14
14
  s.description = %q{Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.}
15
15
  s.email = %q{git@peterc.org}
@@ -22,6 +22,7 @@ Gem::Specification.new do |s|
22
22
  ".document",
23
23
  ".gitignore",
24
24
  "LICENSE",
25
+ "NOTICE",
25
26
  "README.markdown",
26
27
  "Rakefile",
27
28
  "VERSION",
@@ -30,62 +31,69 @@ Gem::Specification.new do |s|
30
31
  "lib/pismo/document.rb",
31
32
  "lib/pismo/external_attributes.rb",
32
33
  "lib/pismo/internal_attributes.rb",
33
- "lib/pismo/readability.rb",
34
+ "lib/pismo/reader.rb",
34
35
  "lib/pismo/stopwords.txt",
35
36
  "pismo.gemspec",
36
37
  "test/corpus/bbcnews.html",
38
+ "test/corpus/bbcnews2.html",
37
39
  "test/corpus/briancray.html",
38
40
  "test/corpus/cant_read.html",
39
41
  "test/corpus/factor.html",
42
+ "test/corpus/gmane.html",
40
43
  "test/corpus/huffington.html",
41
44
  "test/corpus/metadata_expected.yaml",
42
45
  "test/corpus/metadata_expected.yaml.old",
46
+ "test/corpus/queness.html",
47
+ "test/corpus/reader_expected.yaml",
43
48
  "test/corpus/rubyinside.html",
44
49
  "test/corpus/rww.html",
45
50
  "test/corpus/spolsky.html",
46
51
  "test/corpus/techcrunch.html",
52
+ "test/corpus/tweet.html",
47
53
  "test/corpus/youtube.html",
54
+ "test/corpus/zefrank.html",
48
55
  "test/helper.rb",
49
56
  "test/test_corpus.rb",
50
- "test/test_pismo_document.rb",
51
- "test/test_readability.rb"
57
+ "test/test_pismo_document.rb"
52
58
  ]
53
59
  s.homepage = %q{http://github.com/peterc/pismo}
54
60
  s.rdoc_options = ["--charset=UTF-8"]
55
61
  s.require_paths = ["lib"]
56
- s.rubygems_version = %q{1.3.5}
62
+ s.rubygems_version = %q{1.3.7}
57
63
  s.summary = %q{Extracts or retrieves content-related metadata from HTML pages}
58
64
  s.test_files = [
59
65
  "test/helper.rb",
60
66
  "test/test_corpus.rb",
61
- "test/test_pismo_document.rb",
62
- "test/test_readability.rb"
67
+ "test/test_pismo_document.rb"
63
68
  ]
64
69
 
65
70
  if s.respond_to? :specification_version then
66
71
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
67
72
  s.specification_version = 3
68
73
 
69
- if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
74
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
70
75
  s.add_development_dependency(%q<shoulda>, [">= 0"])
76
+ s.add_development_dependency(%q<awesome_print>, [">= 0"])
77
+ s.add_runtime_dependency(%q<jeweler>, [">= 0"])
71
78
  s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
72
- s.add_runtime_dependency(%q<loofah>, [">= 0"])
73
- s.add_runtime_dependency(%q<httparty>, [">= 0"])
79
+ s.add_runtime_dependency(%q<sanitize>, [">= 0"])
74
80
  s.add_runtime_dependency(%q<fast-stemmer>, [">= 0"])
75
81
  s.add_runtime_dependency(%q<chronic>, [">= 0"])
76
82
  else
77
83
  s.add_dependency(%q<shoulda>, [">= 0"])
84
+ s.add_dependency(%q<awesome_print>, [">= 0"])
85
+ s.add_dependency(%q<jeweler>, [">= 0"])
78
86
  s.add_dependency(%q<nokogiri>, [">= 0"])
79
- s.add_dependency(%q<loofah>, [">= 0"])
80
- s.add_dependency(%q<httparty>, [">= 0"])
87
+ s.add_dependency(%q<sanitize>, [">= 0"])
81
88
  s.add_dependency(%q<fast-stemmer>, [">= 0"])
82
89
  s.add_dependency(%q<chronic>, [">= 0"])
83
90
  end
84
91
  else
85
92
  s.add_dependency(%q<shoulda>, [">= 0"])
93
+ s.add_dependency(%q<awesome_print>, [">= 0"])
94
+ s.add_dependency(%q<jeweler>, [">= 0"])
86
95
  s.add_dependency(%q<nokogiri>, [">= 0"])
87
- s.add_dependency(%q<loofah>, [">= 0"])
88
- s.add_dependency(%q<httparty>, [">= 0"])
96
+ s.add_dependency(%q<sanitize>, [">= 0"])
89
97
  s.add_dependency(%q<fast-stemmer>, [">= 0"])
90
98
  s.add_dependency(%q<chronic>, [">= 0"])
91
99
  end