dq-readability 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/dq-readability.gemspec +1 -1
- data/lib/dq-readability.rb +99 -30
- metadata +35 -22
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MjQ0NzVlMmZjMjM1ZjI2MzI1NjJiZTBmYjBjNjllYzFkMDZiNmQ4OQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
MWM2MzE4ZDgyOTZhNTVhYWE3YzgxOTdmY2RiZWRkZWIzZGQzMDU5Zg==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
MGNkNjRmMzgzMWE4NzZjNzg0MDVhYzI0NzJmY2FmMmZiZTAwYTIxYmMyNzE0
|
10
|
+
OTE4ODFiODg1ZjBhNTcxMTYxNTk3MzdkMmE0MjcwN2E3MzY1MzU5NDY5M2Fh
|
11
|
+
ODNmYTIwYWU3MjYyZWE1YWUyODI0NGUzZWVjN2E5MTYyNjEyZTI=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
OWQ1OWZiYTFmYTUwODk1YjZjZGM1ZDcxOWQwNWIxMDI3ZjRjZTc3YjUzYzY2
|
14
|
+
ZTU5OTEwYzViMmNhMzdkOWQxOTlmNjJlODEwMGUzNDdiZjY5YmU2YzNjYWQ3
|
15
|
+
MDRmNjYyNWU4NGYxMWMxOThiZDJmYjIyZGNkMjgzZDI3MDU2MTM=
|
data/dq-readability.gemspec
CHANGED
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "dq-readability"
|
6
|
-
s.version = '1.0.1'
|
6
|
+
s.version = '1.0.2'
|
7
7
|
s.authors = ["Prateek Papriwal"]
|
8
8
|
s.email = ["papriwalprateek@gmail.com"]
|
9
9
|
s.homepage = "http://github.com/DaQwest/dq-readability"
|
data/lib/dq-readability.rb
CHANGED
@@ -3,9 +3,11 @@
|
|
3
3
|
require 'rubygems'
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'guess_html_encoding'
|
6
|
+
require 'open-uri'
|
6
7
|
|
7
8
|
module DQReadability
|
8
9
|
class Document
|
10
|
+
|
9
11
|
DEFAULT_OPTIONS = {
|
10
12
|
:retry_length => 250,
|
11
13
|
:min_text_length => 25,
|
@@ -36,8 +38,9 @@ module DQReadability
|
|
36
38
|
|
37
39
|
def initialize(input, options = {})
|
38
40
|
@options = DEFAULT_OPTIONS.merge(options)
|
39
|
-
@input = input
|
40
|
-
|
41
|
+
@input = open(input).read
|
42
|
+
@url = input
|
43
|
+
|
41
44
|
if RUBY_VERSION =~ /^(1\.9|2)/ && !@options[:encoding]
|
42
45
|
@input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
|
43
46
|
@options[:encoding] = @input.encoding.to_s
|
@@ -67,6 +70,72 @@ module DQReadability
|
|
67
70
|
|
68
71
|
# Remove html comment tags
|
69
72
|
@html.xpath('//comment()').each { |i| i.remove }
|
73
|
+
|
74
|
+
# making all the headings of same format
|
75
|
+
@html.css("h1").each do |h|
|
76
|
+
h.name = "h2"
|
77
|
+
end
|
78
|
+
|
79
|
+
@html.css("h2").each do |h|
|
80
|
+
h.name = "h3"
|
81
|
+
end
|
82
|
+
|
83
|
+
@html.css("h4").each do |h|
|
84
|
+
h.name = "h3"
|
85
|
+
end
|
86
|
+
|
87
|
+
uri = URI.parse(@url)
|
88
|
+
host = uri.host
|
89
|
+
scheme = uri.scheme
|
90
|
+
port = uri.port # defaults to 80
|
91
|
+
base = "#{scheme}://#{host}:#{port}/"
|
92
|
+
|
93
|
+
|
94
|
+
|
95
|
+
# changing img src
|
96
|
+
@html.css("img").each do |elem|
|
97
|
+
begin
|
98
|
+
if elem['src'][0] == '/'
|
99
|
+
elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
100
|
+
else
|
101
|
+
if @url.split('').last == '/'
|
102
|
+
elem['src'] = URI.join(@url,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
103
|
+
else
|
104
|
+
x = @url.split('/')
|
105
|
+
x.delete_at(x.length-1)
|
106
|
+
y = ''
|
107
|
+
x.each{|i| y += i+'/'}
|
108
|
+
elem['src'] = URI.join(y,elem['src']).to_s if URI.parse(elem['src']).host == nil
|
109
|
+
end
|
110
|
+
end
|
111
|
+
rescue
|
112
|
+
elem.remove
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
#changing the 'a' href
|
117
|
+
|
118
|
+
@html.css("a").each do |elem|
|
119
|
+
begin
|
120
|
+
if elem['href'][0] == '/'
|
121
|
+
elem['href'] = URI.join(base,elem['href']).to_s if URI.parse(elem['href']).host == nil
|
122
|
+
else
|
123
|
+
if @url.split('').last == '/'
|
124
|
+
elem['href'] = URI.join(@url,elem['href']).to_s if URI.parse(elem['href']).host == nil
|
125
|
+
else
|
126
|
+
x = @url.split('/')
|
127
|
+
x.delete_at(x.length-1)
|
128
|
+
y = ''
|
129
|
+
x.each{|i| y += i+'/'}
|
130
|
+
elem['href'] = URI.join(y,elem['href']).to_s if URI.parse(elem['href']).host == nil
|
131
|
+
end
|
132
|
+
end
|
133
|
+
rescue
|
134
|
+
elem['href'] = ""
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
|
70
139
|
end
|
71
140
|
|
72
141
|
def images(content=nil, reload=false)
|
@@ -407,12 +476,12 @@ module DQReadability
|
|
407
476
|
elem.remove
|
408
477
|
end
|
409
478
|
|
410
|
-
if @options[:remove_empty_nodes]
|
411
|
-
# remove <p> tags that have no text content - this will also remove p tags that contain only images.
|
412
|
-
node.css("p").each do |elem|
|
413
|
-
elem.remove if elem.content.strip.empty?
|
414
|
-
end
|
415
|
-
end
|
479
|
+
# if @options[:remove_empty_nodes]
|
480
|
+
# # remove <p> tags that have no text content - this will also remove p tags that contain only images.
|
481
|
+
# node.css("p").each do |elem|
|
482
|
+
# elem.remove if elem.content.strip.empty?
|
483
|
+
# end
|
484
|
+
# end
|
416
485
|
|
417
486
|
# Conditionally clean <table>s, <ul>s, and <div>s
|
418
487
|
clean_conditionally(node, candidates, "table, ul, div")
|
@@ -481,28 +550,28 @@ module DQReadability
|
|
481
550
|
to_remove = false
|
482
551
|
reason = ""
|
483
552
|
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
553
|
+
if (counts["img"] > counts["p"]) && (counts["img"] > 1)
|
554
|
+
reason = "too many images"
|
555
|
+
to_remove = true
|
556
|
+
elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
|
557
|
+
reason = "more <li>s than <p>s"
|
558
|
+
to_remove = true
|
559
|
+
elsif counts["input"] > (counts["p"] / 3).to_i
|
560
|
+
reason = "less than 3x <p>s than <input>s"
|
561
|
+
to_remove = true
|
562
|
+
elsif (content_length < options[:min_text_length]) && (counts["img"] != 1)
|
563
|
+
reason = "too short a content length without a single image"
|
564
|
+
to_remove = true
|
565
|
+
elsif weight < 25 && link_density > 0.2
|
566
|
+
reason = "too many links for its weight (#{weight})"
|
567
|
+
to_remove = true
|
568
|
+
elsif weight >= 25 && link_density > 0.5
|
569
|
+
reason = "too many links for its weight (#{weight})"
|
570
|
+
to_remove = true
|
571
|
+
elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
|
572
|
+
reason = "<embed>s with too short a content length, or too many <embed>s"
|
573
|
+
to_remove = true
|
574
|
+
end
|
506
575
|
|
507
576
|
if to_remove
|
508
577
|
debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
|
metadata
CHANGED
@@ -1,71 +1,85 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dq-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
5
|
-
prerelease:
|
4
|
+
version: 1.0.2
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Prateek Papriwal
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2014-01-
|
11
|
+
date: 2014-01-24 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: rspec
|
16
|
-
requirement:
|
17
|
-
none: false
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
18
16
|
requirements:
|
19
17
|
- - ! '>='
|
20
18
|
- !ruby/object:Gem::Version
|
21
19
|
version: '2.8'
|
22
20
|
type: :development
|
23
21
|
prerelease: false
|
24
|
-
version_requirements:
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ! '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.8'
|
25
27
|
- !ruby/object:Gem::Dependency
|
26
28
|
name: rspec-expectations
|
27
|
-
requirement:
|
28
|
-
none: false
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
29
30
|
requirements:
|
30
31
|
- - ! '>='
|
31
32
|
- !ruby/object:Gem::Version
|
32
33
|
version: '2.8'
|
33
34
|
type: :development
|
34
35
|
prerelease: false
|
35
|
-
version_requirements:
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.8'
|
36
41
|
- !ruby/object:Gem::Dependency
|
37
42
|
name: rr
|
38
|
-
requirement:
|
39
|
-
none: false
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
40
44
|
requirements:
|
41
45
|
- - ! '>='
|
42
46
|
- !ruby/object:Gem::Version
|
43
47
|
version: '1.0'
|
44
48
|
type: :development
|
45
49
|
prerelease: false
|
46
|
-
version_requirements:
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.0'
|
47
55
|
- !ruby/object:Gem::Dependency
|
48
56
|
name: nokogiri
|
49
|
-
requirement:
|
50
|
-
none: false
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
51
58
|
requirements:
|
52
59
|
- - ! '>='
|
53
60
|
- !ruby/object:Gem::Version
|
54
61
|
version: 1.4.2
|
55
62
|
type: :runtime
|
56
63
|
prerelease: false
|
57
|
-
version_requirements:
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ! '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 1.4.2
|
58
69
|
- !ruby/object:Gem::Dependency
|
59
70
|
name: guess_html_encoding
|
60
|
-
requirement:
|
61
|
-
none: false
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
62
72
|
requirements:
|
63
73
|
- - ! '>='
|
64
74
|
- !ruby/object:Gem::Version
|
65
75
|
version: 0.0.4
|
66
76
|
type: :runtime
|
67
77
|
prerelease: false
|
68
|
-
version_requirements:
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 0.0.4
|
69
83
|
description: Port of arc90's readability project to ruby. The base code is derived
|
70
84
|
from https://github.com/cantino/ruby-readability
|
71
85
|
email:
|
@@ -105,26 +119,25 @@ files:
|
|
105
119
|
- spec/spec_helper.rb
|
106
120
|
homepage: http://github.com/DaQwest/dq-readability
|
107
121
|
licenses: []
|
122
|
+
metadata: {}
|
108
123
|
post_install_message:
|
109
124
|
rdoc_options: []
|
110
125
|
require_paths:
|
111
126
|
- lib
|
112
127
|
required_ruby_version: !ruby/object:Gem::Requirement
|
113
|
-
none: false
|
114
128
|
requirements:
|
115
129
|
- - ! '>='
|
116
130
|
- !ruby/object:Gem::Version
|
117
131
|
version: '0'
|
118
132
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
119
|
-
none: false
|
120
133
|
requirements:
|
121
134
|
- - ! '>='
|
122
135
|
- !ruby/object:Gem::Version
|
123
136
|
version: '0'
|
124
137
|
requirements: []
|
125
138
|
rubyforge_project: dq-readability
|
126
|
-
rubygems_version: 1.
|
139
|
+
rubygems_version: 2.1.11
|
127
140
|
signing_key:
|
128
|
-
specification_version:
|
141
|
+
specification_version: 4
|
129
142
|
summary: Port of arc90's readability project to ruby
|
130
143
|
test_files: []
|