dq-readability 1.0.2 → 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MjQ0NzVlMmZjMjM1ZjI2MzI1NjJiZTBmYjBjNjllYzFkMDZiNmQ4OQ==
4
+ MmU4OWIxYWQwODAwMzUzMjliNGNhZWE3MDgxZmZjM2ZhNDE3MmRhMw==
5
5
  data.tar.gz: !binary |-
6
- MWM2MzE4ZDgyOTZhNTVhYWE3YzgxOTdmY2RiZWRkZWIzZGQzMDU5Zg==
6
+ MDhkMDhiMzIxNjZmZjg0ZjkyZmM5NTlkYmRkYTc1NzZiNTU2YjZlZA==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MGNkNjRmMzgzMWE4NzZjNzg0MDVhYzI0NzJmY2FmMmZiZTAwYTIxYmMyNzE0
10
- OTE4ODFiODg1ZjBhNTcxMTYxNTk3MzdkMmE0MjcwN2E3MzY1MzU5NDY5M2Fh
11
- ODNmYTIwYWU3MjYyZWE1YWUyODI0NGUzZWVjN2E5MTYyNjEyZTI=
9
+ ODE2OGQ5YjNkYzhjNGM0NjJlOWM4YjM4MWE1ZWE4MjgzODlmMTE3ZjZkYzc0
10
+ OGU2YmMwOTc2YTBhYWRjMWZkN2MzZTRjODM0Njk2ZGQ1ZjQ5ZDI1ZmZhNDc2
11
+ ZDVmMzJiNzljMzdjN2IzYzFiZmEzMmNmYTJlMDhlNjVmNTRiMTY=
12
12
  data.tar.gz: !binary |-
13
- OWQ1OWZiYTFmYTUwODk1YjZjZGM1ZDcxOWQwNWIxMDI3ZjRjZTc3YjUzYzY2
14
- ZTU5OTEwYzViMmNhMzdkOWQxOTlmNjJlODEwMGUzNDdiZjY5YmU2YzNjYWQ3
15
- MDRmNjYyNWU4NGYxMWMxOThiZDJmYjIyZGNkMjgzZDI3MDU2MTM=
13
+ NGExMDk0MjkyZDRiZjJiYzhmYTk3MDUyMzU2NTNhNTdlMzZlNzQwNWZiZWZh
14
+ YTIzYzBmNGVkNzc3NThjYTBiMDA5NWRmMDQ3OWQzMzY4ODJiYTE3ZDNiNmM3
15
+ ZmZkZTllZjkwYzg4Njc5NTA1YjczNzhlMTllYzgxNDgyMzkzNTI=
data/README.md CHANGED
@@ -1 +1,19 @@
1
+ Install
2
+ -------
3
+ Command line:
4
+
5
+ (sudo) gem install dq-readability
6
+
7
+ Bundler:
8
+
9
+ gem "dq-readability"
10
+ Example
11
+ -------
12
+ require 'rubygems'
13
+ require 'dq-readability'
14
+ source = "http://www.personal.kent.edu/~rmuhamma/Algorithms/MyAlgorithms/Sorting/radixSort.htm"
15
+ puts DQReadability::Document.new(source,:tags=>%w[div pre p h1 h2 h3 h4 td table tr b a img br li ul ol center br hr blockquote em strong sub sup font tbody span],:attributes=>%w[href src align width color height]).content
16
+
17
+
18
+
1
19
 
@@ -3,12 +3,12 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "dq-readability"
6
- s.version = '1.0.2'
6
+ s.version = '1.0.3'
7
7
  s.authors = ["Prateek Papriwal"]
8
8
  s.email = ["papriwalprateek@gmail.com"]
9
9
  s.homepage = "http://github.com/DaQwest/dq-readability"
10
10
  s.summary = %q{Port of arc90's readability project to ruby}
11
- s.description = %q{Port of arc90's readability project to ruby. The base code is derived from https://github.com/cantino/ruby-readability}
11
+ s.description = %q{Extracts main content of the webpage. Presents in good readable format.}
12
12
 
13
13
  s.rubyforge_project = "dq-readability"
14
14
 
@@ -96,7 +96,11 @@ module DQReadability
96
96
  @html.css("img").each do |elem|
97
97
  begin
98
98
  if elem['src'][0] == '/'
99
- elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
99
+ if elem['src'][1] == '/'
100
+ elem['src'] = 'http:'+elem['src']
101
+ else
102
+ elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
103
+ end
100
104
  else
101
105
  if @url.split('').last == '/'
102
106
  elem['src'] = URI.join(@url,elem['src']).to_s if URI.parse(elem['src']).host == nil
@@ -113,7 +117,16 @@ module DQReadability
113
117
  end
114
118
  end
115
119
 
116
- #changing the 'a' href
120
+ # changing certain tags to <p> tags
121
+
122
+ x = @html.css("ol")
123
+ x.each do |t|
124
+ t.name = "p"
125
+ end
126
+ len = @html.css('ol').length
127
+ debug("length of ol tag #{len}")
128
+
129
+ #changing the 'a' href
117
130
 
118
131
  @html.css("a").each do |elem|
119
132
  begin
@@ -525,7 +538,7 @@ module DQReadability
525
538
  html = node.serialize(:save_with => save_opts)
526
539
 
527
540
  # Get rid of duplicate whitespace
528
- return html.gsub(/[\r\n\f]+/, "\n" )
541
+ return "<head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'></head>" + "\n" + html.gsub(/[\r\n\f]+/, "\n" )
529
542
  end
530
543
 
531
544
  def clean_conditionally(node, candidates, selector)
@@ -550,10 +563,10 @@ module DQReadability
550
563
  to_remove = false
551
564
  reason = ""
552
565
 
553
- if (counts["img"] > counts["p"]) && (counts["img"] > 1)
566
+ if (counts["img"] > counts["p"]+2)
554
567
  reason = "too many images"
555
568
  to_remove = true
556
- elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
569
+ elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
557
570
  reason = "more <li>s than <p>s"
558
571
  to_remove = true
559
572
  elsif counts["input"] > (counts["p"] / 3).to_i
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dq-readability
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Prateek Papriwal
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-01-24 00:00:00.000000000 Z
11
+ date: 2014-02-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -80,8 +80,7 @@ dependencies:
80
80
  - - ! '>='
81
81
  - !ruby/object:Gem::Version
82
82
  version: 0.0.4
83
- description: Port of arc90's readability project to ruby. The base code is derived
84
- from https://github.com/cantino/ruby-readability
83
+ description: Extracts main content of the webpage. Presents in good readable format.
85
84
  email:
86
85
  - papriwalprateek@gmail.com
87
86
  executables: