dq-readability 1.0.2 → 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/README.md +18 -0
- data/dq-readability.gemspec +2 -2
- data/lib/dq-readability.rb +18 -5
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MmU4OWIxYWQwODAwMzUzMjliNGNhZWE3MDgxZmZjM2ZhNDE3MmRhMw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
MDhkMDhiMzIxNjZmZjg0ZjkyZmM5NTlkYmRkYTc1NzZiNTU2YjZlZA==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ODE2OGQ5YjNkYzhjNGM0NjJlOWM4YjM4MWE1ZWE4MjgzODlmMTE3ZjZkYzc0
|
10
|
+
OGU2YmMwOTc2YTBhYWRjMWZkN2MzZTRjODM0Njk2ZGQ1ZjQ5ZDI1ZmZhNDc2
|
11
|
+
ZDVmMzJiNzljMzdjN2IzYzFiZmEzMmNmYTJlMDhlNjVmNTRiMTY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NGExMDk0MjkyZDRiZjJiYzhmYTk3MDUyMzU2NTNhNTdlMzZlNzQwNWZiZWZh
|
14
|
+
YTIzYzBmNGVkNzc3NThjYTBiMDA5NWRmMDQ3OWQzMzY4ODJiYTE3ZDNiNmM3
|
15
|
+
ZmZkZTllZjkwYzg4Njc5NTA1YjczNzhlMTllYzgxNDgyMzkzNTI=
|
data/README.md
CHANGED
@@ -1 +1,19 @@
|
|
1
|
+
Install
|
2
|
+
-------
|
3
|
+
Command line:
|
4
|
+
|
5
|
+
(sudo) gem install dq-readability
|
6
|
+
|
7
|
+
Bundler:
|
8
|
+
|
9
|
+
gem "dq-readability"
|
10
|
+
Example
|
11
|
+
-------
|
12
|
+
require 'rubygems'
|
13
|
+
require 'dq-readability'
|
14
|
+
source = "http://www.personal.kent.edu/~rmuhamma/Algorithms/MyAlgorithms/Sorting/radixSort.htm"
|
15
|
+
puts DQReadability::Document.new(source,:tags=>%w[div pre p h1 h2 h3 h4 td table tr b a img br li ul ol center br hr blockquote em strong sub sup font tbody span],:attributes=>%w[href src align width color height]).content
|
16
|
+
|
17
|
+
|
18
|
+
|
1
19
|
|
data/dq-readability.gemspec
CHANGED
@@ -3,12 +3,12 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "dq-readability"
|
6
|
-
s.version = '1.0.2'
|
6
|
+
s.version = '1.0.3'
|
7
7
|
s.authors = ["Prateek Papriwal"]
|
8
8
|
s.email = ["papriwalprateek@gmail.com"]
|
9
9
|
s.homepage = "http://github.com/DaQwest/dq-readability"
|
10
10
|
s.summary = %q{Port of arc90's readability project to ruby}
|
11
|
-
s.description = %q{
|
11
|
+
s.description = %q{Extracts main content of the webpage. Presents in good readable format.}
|
12
12
|
|
13
13
|
s.rubyforge_project = "dq-readability"
|
14
14
|
|
data/lib/dq-readability.rb
CHANGED
@@ -96,7 +96,11 @@ module DQReadability
|
|
96
96
|
@html.css("img").each do |elem|
|
97
97
|
begin
|
98
98
|
if elem['src'][0] == '/'
|
99
|
-
elem['src']
|
99
|
+
if elem['src'][1] == '/'
|
100
|
+
elem['src'] = 'http:'+elem['src']
|
101
|
+
else
|
102
|
+
elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
103
|
+
end
|
100
104
|
else
|
101
105
|
if @url.split('').last == '/'
|
102
106
|
elem['src'] = URI.join(@url,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
@@ -113,7 +117,16 @@ module DQReadability
|
|
113
117
|
end
|
114
118
|
end
|
115
119
|
|
116
|
-
|
120
|
+
# changing certain tags to <p> tags
|
121
|
+
|
122
|
+
x = @html.css("ol")
|
123
|
+
x.each do |t|
|
124
|
+
t.name = "p"
|
125
|
+
end
|
126
|
+
len = @html.css('ol').length
|
127
|
+
debug("length of ol tag #{len}")
|
128
|
+
|
129
|
+
#changing the 'a' href
|
117
130
|
|
118
131
|
@html.css("a").each do |elem|
|
119
132
|
begin
|
@@ -525,7 +538,7 @@ module DQReadability
|
|
525
538
|
html = node.serialize(:save_with => save_opts)
|
526
539
|
|
527
540
|
# Get rid of duplicate whitespace
|
528
|
-
return html.gsub(/[\r\n\f]+/, "\n" )
|
541
|
+
return "<head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'></head>" + "\n" + html.gsub(/[\r\n\f]+/, "\n" )
|
529
542
|
end
|
530
543
|
|
531
544
|
def clean_conditionally(node, candidates, selector)
|
@@ -550,10 +563,10 @@ module DQReadability
|
|
550
563
|
to_remove = false
|
551
564
|
reason = ""
|
552
565
|
|
553
|
-
if (counts["img"] > counts["p"])
|
566
|
+
if (counts["img"] > counts["p"]+2)
|
554
567
|
reason = "too many images"
|
555
568
|
to_remove = true
|
556
|
-
|
569
|
+
elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
|
557
570
|
reason = "more <li>s than <p>s"
|
558
571
|
to_remove = true
|
559
572
|
elsif counts["input"] > (counts["p"] / 3).to_i
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dq-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Prateek Papriwal
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-02-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -80,8 +80,7 @@ dependencies:
|
|
80
80
|
- - ! '>='
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 0.0.4
|
83
|
-
description:
|
84
|
-
from https://github.com/cantino/ruby-readability
|
83
|
+
description: Extracts main content of the webpage. Presents in good readable format.
|
85
84
|
email:
|
86
85
|
- papriwalprateek@gmail.com
|
87
86
|
executables:
|