dq-readability 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/README.md +18 -0
- data/dq-readability.gemspec +2 -2
- data/lib/dq-readability.rb +18 -5
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MmU4OWIxYWQwODAwMzUzMjliNGNhZWE3MDgxZmZjM2ZhNDE3MmRhMw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
MDhkMDhiMzIxNjZmZjg0ZjkyZmM5NTlkYmRkYTc1NzZiNTU2YjZlZA==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ODE2OGQ5YjNkYzhjNGM0NjJlOWM4YjM4MWE1ZWE4MjgzODlmMTE3ZjZkYzc0
|
10
|
+
OGU2YmMwOTc2YTBhYWRjMWZkN2MzZTRjODM0Njk2ZGQ1ZjQ5ZDI1ZmZhNDc2
|
11
|
+
ZDVmMzJiNzljMzdjN2IzYzFiZmEzMmNmYTJlMDhlNjVmNTRiMTY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NGExMDk0MjkyZDRiZjJiYzhmYTk3MDUyMzU2NTNhNTdlMzZlNzQwNWZiZWZh
|
14
|
+
YTIzYzBmNGVkNzc3NThjYTBiMDA5NWRmMDQ3OWQzMzY4ODJiYTE3ZDNiNmM3
|
15
|
+
ZmZkZTllZjkwYzg4Njc5NTA1YjczNzhlMTllYzgxNDgyMzkzNTI=
|
data/README.md
CHANGED
@@ -1 +1,19 @@
|
|
1
|
+
Install
|
2
|
+
-------
|
3
|
+
Command line:
|
4
|
+
|
5
|
+
(sudo) gem install dq-readability
|
6
|
+
|
7
|
+
Bundler:
|
8
|
+
|
9
|
+
gem "dq-readability"
|
10
|
+
Example
|
11
|
+
-------
|
12
|
+
require 'rubygems'
|
13
|
+
require 'dq-readability'
|
14
|
+
source = "http://www.personal.kent.edu/~rmuhamma/Algorithms/MyAlgorithms/Sorting/radixSort.htm"
|
15
|
+
puts DQReadability::Document.new(source,:tags=>%w[div pre p h1 h2 h3 h4 td table tr b a img br li ul ol center br hr blockquote em strong sub sup font tbody span],:attributes=>%w[href src align width color height]).content
|
16
|
+
|
17
|
+
|
18
|
+
|
1
19
|
|
data/dq-readability.gemspec
CHANGED
@@ -3,12 +3,12 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "dq-readability"
|
6
|
-
s.version = '1.0.2'
|
6
|
+
s.version = '1.0.3'
|
7
7
|
s.authors = ["Prateek Papriwal"]
|
8
8
|
s.email = ["papriwalprateek@gmail.com"]
|
9
9
|
s.homepage = "http://github.com/DaQwest/dq-readability"
|
10
10
|
s.summary = %q{Port of arc90's readability project to ruby}
|
11
|
-
s.description = %q{
|
11
|
+
s.description = %q{Extracts main content of the webpage. Presents in good readable format.}
|
12
12
|
|
13
13
|
s.rubyforge_project = "dq-readability"
|
14
14
|
|
data/lib/dq-readability.rb
CHANGED
@@ -96,7 +96,11 @@ module DQReadability
|
|
96
96
|
@html.css("img").each do |elem|
|
97
97
|
begin
|
98
98
|
if elem['src'][0] == '/'
|
99
|
-
elem['src']
|
99
|
+
if elem['src'][1] == '/'
|
100
|
+
elem['src'] = 'http:'+elem['src']
|
101
|
+
else
|
102
|
+
elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
103
|
+
end
|
100
104
|
else
|
101
105
|
if @url.split('').last == '/'
|
102
106
|
elem['src'] = URI.join(@url,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
@@ -113,7 +117,16 @@ module DQReadability
|
|
113
117
|
end
|
114
118
|
end
|
115
119
|
|
116
|
-
|
120
|
+
# changing certain tags to <p> tags
|
121
|
+
|
122
|
+
x = @html.css("ol")
|
123
|
+
x.each do |t|
|
124
|
+
t.name = "p"
|
125
|
+
end
|
126
|
+
len = @html.css('ol').length
|
127
|
+
debug("length of ol tag #{len}")
|
128
|
+
|
129
|
+
#changing the 'a' href
|
117
130
|
|
118
131
|
@html.css("a").each do |elem|
|
119
132
|
begin
|
@@ -525,7 +538,7 @@ module DQReadability
|
|
525
538
|
html = node.serialize(:save_with => save_opts)
|
526
539
|
|
527
540
|
# Get rid of duplicate whitespace
|
528
|
-
return html.gsub(/[\r\n\f]+/, "\n" )
|
541
|
+
return "<head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'></head>" + "\n" + html.gsub(/[\r\n\f]+/, "\n" )
|
529
542
|
end
|
530
543
|
|
531
544
|
def clean_conditionally(node, candidates, selector)
|
@@ -550,10 +563,10 @@ module DQReadability
|
|
550
563
|
to_remove = false
|
551
564
|
reason = ""
|
552
565
|
|
553
|
-
if (counts["img"] > counts["p"])
|
566
|
+
if (counts["img"] > counts["p"]+2)
|
554
567
|
reason = "too many images"
|
555
568
|
to_remove = true
|
556
|
-
|
569
|
+
elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
|
557
570
|
reason = "more <li>s than <p>s"
|
558
571
|
to_remove = true
|
559
572
|
elsif counts["input"] > (counts["p"] / 3).to_i
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dq-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Prateek Papriwal
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-02-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -80,8 +80,7 @@ dependencies:
|
|
80
80
|
- - ! '>='
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 0.0.4
|
83
|
-
description:
|
84
|
-
from https://github.com/cantino/ruby-readability
|
83
|
+
description: Extracts main content of the webpage. Presents in good readable format.
|
85
84
|
email:
|
86
85
|
- papriwalprateek@gmail.com
|
87
86
|
executables:
|