dq-readability 1.0.2 → 1.0.3

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MjQ0NzVlMmZjMjM1ZjI2MzI1NjJiZTBmYjBjNjllYzFkMDZiNmQ4OQ==
4
+ MmU4OWIxYWQwODAwMzUzMjliNGNhZWE3MDgxZmZjM2ZhNDE3MmRhMw==
5
5
  data.tar.gz: !binary |-
6
- MWM2MzE4ZDgyOTZhNTVhYWE3YzgxOTdmY2RiZWRkZWIzZGQzMDU5Zg==
6
+ MDhkMDhiMzIxNjZmZjg0ZjkyZmM5NTlkYmRkYTc1NzZiNTU2YjZlZA==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MGNkNjRmMzgzMWE4NzZjNzg0MDVhYzI0NzJmY2FmMmZiZTAwYTIxYmMyNzE0
10
- OTE4ODFiODg1ZjBhNTcxMTYxNTk3MzdkMmE0MjcwN2E3MzY1MzU5NDY5M2Fh
11
- ODNmYTIwYWU3MjYyZWE1YWUyODI0NGUzZWVjN2E5MTYyNjEyZTI=
9
+ ODE2OGQ5YjNkYzhjNGM0NjJlOWM4YjM4MWE1ZWE4MjgzODlmMTE3ZjZkYzc0
10
+ OGU2YmMwOTc2YTBhYWRjMWZkN2MzZTRjODM0Njk2ZGQ1ZjQ5ZDI1ZmZhNDc2
11
+ ZDVmMzJiNzljMzdjN2IzYzFiZmEzMmNmYTJlMDhlNjVmNTRiMTY=
12
12
  data.tar.gz: !binary |-
13
- OWQ1OWZiYTFmYTUwODk1YjZjZGM1ZDcxOWQwNWIxMDI3ZjRjZTc3YjUzYzY2
14
- ZTU5OTEwYzViMmNhMzdkOWQxOTlmNjJlODEwMGUzNDdiZjY5YmU2YzNjYWQ3
15
- MDRmNjYyNWU4NGYxMWMxOThiZDJmYjIyZGNkMjgzZDI3MDU2MTM=
13
+ NGExMDk0MjkyZDRiZjJiYzhmYTk3MDUyMzU2NTNhNTdlMzZlNzQwNWZiZWZh
14
+ YTIzYzBmNGVkNzc3NThjYTBiMDA5NWRmMDQ3OWQzMzY4ODJiYTE3ZDNiNmM3
15
+ ZmZkZTllZjkwYzg4Njc5NTA1YjczNzhlMTllYzgxNDgyMzkzNTI=
data/README.md CHANGED
@@ -1 +1,19 @@
1
+ Install
2
+ -------
3
+ Command line:
4
+
5
+ (sudo) gem install dq-readability
6
+
7
+ Bundler:
8
+
9
+ gem "dq-readability"
10
+ Example
11
+ -------
12
+ require 'rubygems'
13
+ require 'dq-readability'
14
+ source = "http://www.personal.kent.edu/~rmuhamma/Algorithms/MyAlgorithms/Sorting/radixSort.htm"
15
+ puts DQReadability::Document.new(source,:tags=>%w[div pre p h1 h2 h3 h4 td table tr b a img br li ul ol center br hr blockquote em strong sub sup font tbody span],:attributes=>%w[href src align width color height]).content
16
+
17
+
18
+
1
19
 
@@ -3,12 +3,12 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "dq-readability"
6
- s.version = '1.0.2'
6
+ s.version = '1.0.3'
7
7
  s.authors = ["Prateek Papriwal"]
8
8
  s.email = ["papriwalprateek@gmail.com"]
9
9
  s.homepage = "http://github.com/DaQwest/dq-readability"
10
10
  s.summary = %q{Port of arc90's readability project to ruby}
11
- s.description = %q{Port of arc90's readability project to ruby. The base code is derived from https://github.com/cantino/ruby-readability}
11
+ s.description = %q{Extracts main content of the webpage. Presents in good readable format.}
12
12
 
13
13
  s.rubyforge_project = "dq-readability"
14
14
 
@@ -96,7 +96,11 @@ module DQReadability
96
96
  @html.css("img").each do |elem|
97
97
  begin
98
98
  if elem['src'][0] == '/'
99
- elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
99
+ if elem['src'][1] == '/'
100
+ elem['src'] = 'http:'+elem['src']
101
+ else
102
+ elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
103
+ end
100
104
  else
101
105
  if @url.split('').last == '/'
102
106
  elem['src'] = URI.join(@url,elem['src']).to_s if URI.parse(elem['src']).host == nil
@@ -113,7 +117,16 @@ module DQReadability
113
117
  end
114
118
  end
115
119
 
116
- #changing the 'a' href
120
+ # changing certain tags to <p> tags
121
+
122
+ x = @html.css("ol")
123
+ x.each do |t|
124
+ t.name = "p"
125
+ end
126
+ len = @html.css('ol').length
127
+ debug("length of ol tag #{len}")
128
+
129
+ #changing the 'a' href
117
130
 
118
131
  @html.css("a").each do |elem|
119
132
  begin
@@ -525,7 +538,7 @@ module DQReadability
525
538
  html = node.serialize(:save_with => save_opts)
526
539
 
527
540
  # Get rid of duplicate whitespace
528
- return html.gsub(/[\r\n\f]+/, "\n" )
541
+ return "<head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'></head>" + "\n" + html.gsub(/[\r\n\f]+/, "\n" )
529
542
  end
530
543
 
531
544
  def clean_conditionally(node, candidates, selector)
@@ -550,10 +563,10 @@ module DQReadability
550
563
  to_remove = false
551
564
  reason = ""
552
565
 
553
- if (counts["img"] > counts["p"]) && (counts["img"] > 1)
566
+ if (counts["img"] > counts["p"]+2)
554
567
  reason = "too many images"
555
568
  to_remove = true
556
- elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
569
+ elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
557
570
  reason = "more <li>s than <p>s"
558
571
  to_remove = true
559
572
  elsif counts["input"] > (counts["p"] / 3).to_i
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dq-readability
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Prateek Papriwal
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-01-24 00:00:00.000000000 Z
11
+ date: 2014-02-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -80,8 +80,7 @@ dependencies:
80
80
  - - ! '>='
81
81
  - !ruby/object:Gem::Version
82
82
  version: 0.0.4
83
- description: Port of arc90's readability project to ruby. The base code is derived
84
- from https://github.com/cantino/ruby-readability
83
+ description: Extracts main content of the webpage. Presents in good readable format.
85
84
  email:
86
85
  - papriwalprateek@gmail.com
87
86
  executables: