dq-readability 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MjQ0NzVlMmZjMjM1ZjI2MzI1NjJiZTBmYjBjNjllYzFkMDZiNmQ4OQ==
5
+ data.tar.gz: !binary |-
6
+ MWM2MzE4ZDgyOTZhNTVhYWE3YzgxOTdmY2RiZWRkZWIzZGQzMDU5Zg==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ MGNkNjRmMzgzMWE4NzZjNzg0MDVhYzI0NzJmY2FmMmZiZTAwYTIxYmMyNzE0
10
+ OTE4ODFiODg1ZjBhNTcxMTYxNTk3MzdkMmE0MjcwN2E3MzY1MzU5NDY5M2Fh
11
+ ODNmYTIwYWU3MjYyZWE1YWUyODI0NGUzZWVjN2E5MTYyNjEyZTI=
12
+ data.tar.gz: !binary |-
13
+ OWQ1OWZiYTFmYTUwODk1YjZjZGM1ZDcxOWQwNWIxMDI3ZjRjZTc3YjUzYzY2
14
+ ZTU5OTEwYzViMmNhMzdkOWQxOTlmNjJlODEwMGUzNDdiZjY5YmU2YzNjYWQ3
15
+ MDRmNjYyNWU4NGYxMWMxOThiZDJmYjIyZGNkMjgzZDI3MDU2MTM=
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "dq-readability"
6
- s.version = '1.0.1'
6
+ s.version = '1.0.2'
7
7
  s.authors = ["Prateek Papriwal"]
8
8
  s.email = ["papriwalprateek@gmail.com"]
9
9
  s.homepage = "http://github.com/DaQwest/dq-readability"
@@ -3,9 +3,11 @@
3
3
  require 'rubygems'
4
4
  require 'nokogiri'
5
5
  require 'guess_html_encoding'
6
+ require 'open-uri'
6
7
 
7
8
  module DQReadability
8
9
  class Document
10
+
9
11
  DEFAULT_OPTIONS = {
10
12
  :retry_length => 250,
11
13
  :min_text_length => 25,
@@ -36,8 +38,9 @@ module DQReadability
36
38
 
37
39
  def initialize(input, options = {})
38
40
  @options = DEFAULT_OPTIONS.merge(options)
39
- @input = input
40
-
41
+ @input = open(input).read
42
+ @url = input
43
+
41
44
  if RUBY_VERSION =~ /^(1\.9|2)/ && !@options[:encoding]
42
45
  @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
43
46
  @options[:encoding] = @input.encoding.to_s
@@ -67,6 +70,72 @@ module DQReadability
67
70
 
68
71
  # Remove html comment tags
69
72
  @html.xpath('//comment()').each { |i| i.remove }
73
+
74
+ # making all the headings of same format
75
+ @html.css("h1").each do |h|
76
+ h.name = "h2"
77
+ end
78
+
79
+ @html.css("h2").each do |h|
80
+ h.name = "h3"
81
+ end
82
+
83
+ @html.css("h4").each do |h|
84
+ h.name = "h3"
85
+ end
86
+
87
+ uri = URI.parse(@url)
88
+ host = uri.host
89
+ scheme = uri.scheme
90
+ port = uri.port # defaults to 80
91
+ base = "#{scheme}://#{host}:#{port}/"
92
+
93
+
94
+
95
+ # changing img src
96
+ @html.css("img").each do |elem|
97
+ begin
98
+ if elem['src'][0] == '/'
99
+ elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
100
+ else
101
+ if @url.split('').last == '/'
102
+ elem['src'] = URI.join(@url,elem['src']).to_s if URI.parse(elem['src']).host == nil
103
+ else
104
+ x = @url.split('/')
105
+ x.delete_at(x.length-1)
106
+ y = ''
107
+ x.each{|i| y += i+'/'}
108
+ elem['src'] = URI.join(y,elem['src']).to_s if URI.parse(elem['src']).host == nil
109
+ end
110
+ end
111
+ rescue
112
+ elem.remove
113
+ end
114
+ end
115
+
116
+ #changing the 'a' href
117
+
118
+ @html.css("a").each do |elem|
119
+ begin
120
+ if elem['href'][0] == '/'
121
+ elem['href'] = URI.join(base,elem['href']).to_s if URI.parse(elem['href']).host == nil
122
+ else
123
+ if @url.split('').last == '/'
124
+ elem['href'] = URI.join(@url,elem['href']).to_s if URI.parse(elem['href']).host == nil
125
+ else
126
+ x = @url.split('/')
127
+ x.delete_at(x.length-1)
128
+ y = ''
129
+ x.each{|i| y += i+'/'}
130
+ elem['href'] = URI.join(y,elem['href']).to_s if URI.parse(elem['href']).host == nil
131
+ end
132
+ end
133
+ rescue
134
+ elem['href'] = ""
135
+ end
136
+ end
137
+
138
+
70
139
  end
71
140
 
72
141
  def images(content=nil, reload=false)
@@ -407,12 +476,12 @@ module DQReadability
407
476
  elem.remove
408
477
  end
409
478
 
410
- if @options[:remove_empty_nodes]
411
- # remove <p> tags that have no text content - this will also remove p tags that contain only images.
412
- node.css("p").each do |elem|
413
- elem.remove if elem.content.strip.empty?
414
- end
415
- end
479
+ # if @options[:remove_empty_nodes]
480
+ # # remove <p> tags that have no text content - this will also remove p tags that contain only images.
481
+ # node.css("p").each do |elem|
482
+ # elem.remove if elem.content.strip.empty?
483
+ # end
484
+ # end
416
485
 
417
486
  # Conditionally clean <table>s, <ul>s, and <div>s
418
487
  clean_conditionally(node, candidates, "table, ul, div")
@@ -481,28 +550,28 @@ module DQReadability
481
550
  to_remove = false
482
551
  reason = ""
483
552
 
484
- # if (counts["img"] > counts["p"]) && (counts["img"] > 1)
485
- # reason = "too many images"
486
- # to_remove = true
487
- # elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
488
- # reason = "more <li>s than <p>s"
489
- # to_remove = true
490
- # elsif counts["input"] > (counts["p"] / 3).to_i
491
- # reason = "less than 3x <p>s than <input>s"
492
- # to_remove = true
493
- # elsif (content_length < options[:min_text_length]) && (counts["img"] != 1)
494
- # reason = "too short a content length without a single image"
495
- # to_remove = true
496
- # elsif weight < 25 && link_density > 0.2
497
- # reason = "too many links for its weight (#{weight})"
498
- # to_remove = true
499
- # elsif weight >= 25 && link_density > 0.5
500
- # reason = "too many links for its weight (#{weight})"
501
- # to_remove = true
502
- # elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
503
- # reason = "<embed>s with too short a content length, or too many <embed>s"
504
- # to_remove = true
505
- # end
553
+ if (counts["img"] > counts["p"]) && (counts["img"] > 1)
554
+ reason = "too many images"
555
+ to_remove = true
556
+ elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
557
+ reason = "more <li>s than <p>s"
558
+ to_remove = true
559
+ elsif counts["input"] > (counts["p"] / 3).to_i
560
+ reason = "less than 3x <p>s than <input>s"
561
+ to_remove = true
562
+ elsif (content_length < options[:min_text_length]) && (counts["img"] != 1)
563
+ reason = "too short a content length without a single image"
564
+ to_remove = true
565
+ elsif weight < 25 && link_density > 0.2
566
+ reason = "too many links for its weight (#{weight})"
567
+ to_remove = true
568
+ elsif weight >= 25 && link_density > 0.5
569
+ reason = "too many links for its weight (#{weight})"
570
+ to_remove = true
571
+ elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
572
+ reason = "<embed>s with too short a content length, or too many <embed>s"
573
+ to_remove = true
574
+ end
506
575
 
507
576
  if to_remove
508
577
  debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
metadata CHANGED
@@ -1,71 +1,85 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dq-readability
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
5
- prerelease:
4
+ version: 1.0.2
6
5
  platform: ruby
7
6
  authors:
8
7
  - Prateek Papriwal
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2014-01-22 00:00:00.000000000 Z
11
+ date: 2014-01-24 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: rspec
16
- requirement: &11035780 !ruby/object:Gem::Requirement
17
- none: false
15
+ requirement: !ruby/object:Gem::Requirement
18
16
  requirements:
19
17
  - - ! '>='
20
18
  - !ruby/object:Gem::Version
21
19
  version: '2.8'
22
20
  type: :development
23
21
  prerelease: false
24
- version_requirements: *11035780
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '2.8'
25
27
  - !ruby/object:Gem::Dependency
26
28
  name: rspec-expectations
27
- requirement: &11034780 !ruby/object:Gem::Requirement
28
- none: false
29
+ requirement: !ruby/object:Gem::Requirement
29
30
  requirements:
30
31
  - - ! '>='
31
32
  - !ruby/object:Gem::Version
32
33
  version: '2.8'
33
34
  type: :development
34
35
  prerelease: false
35
- version_requirements: *11034780
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '2.8'
36
41
  - !ruby/object:Gem::Dependency
37
42
  name: rr
38
- requirement: &11034020 !ruby/object:Gem::Requirement
39
- none: false
43
+ requirement: !ruby/object:Gem::Requirement
40
44
  requirements:
41
45
  - - ! '>='
42
46
  - !ruby/object:Gem::Version
43
47
  version: '1.0'
44
48
  type: :development
45
49
  prerelease: false
46
- version_requirements: *11034020
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
47
55
  - !ruby/object:Gem::Dependency
48
56
  name: nokogiri
49
- requirement: &11033240 !ruby/object:Gem::Requirement
50
- none: false
57
+ requirement: !ruby/object:Gem::Requirement
51
58
  requirements:
52
59
  - - ! '>='
53
60
  - !ruby/object:Gem::Version
54
61
  version: 1.4.2
55
62
  type: :runtime
56
63
  prerelease: false
57
- version_requirements: *11033240
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ! '>='
67
+ - !ruby/object:Gem::Version
68
+ version: 1.4.2
58
69
  - !ruby/object:Gem::Dependency
59
70
  name: guess_html_encoding
60
- requirement: &11032640 !ruby/object:Gem::Requirement
61
- none: false
71
+ requirement: !ruby/object:Gem::Requirement
62
72
  requirements:
63
73
  - - ! '>='
64
74
  - !ruby/object:Gem::Version
65
75
  version: 0.0.4
66
76
  type: :runtime
67
77
  prerelease: false
68
- version_requirements: *11032640
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: 0.0.4
69
83
  description: Port of arc90's readability project to ruby. The base code is derived
70
84
  from https://github.com/cantino/ruby-readability
71
85
  email:
@@ -105,26 +119,25 @@ files:
105
119
  - spec/spec_helper.rb
106
120
  homepage: http://github.com/DaQwest/dq-readability
107
121
  licenses: []
122
+ metadata: {}
108
123
  post_install_message:
109
124
  rdoc_options: []
110
125
  require_paths:
111
126
  - lib
112
127
  required_ruby_version: !ruby/object:Gem::Requirement
113
- none: false
114
128
  requirements:
115
129
  - - ! '>='
116
130
  - !ruby/object:Gem::Version
117
131
  version: '0'
118
132
  required_rubygems_version: !ruby/object:Gem::Requirement
119
- none: false
120
133
  requirements:
121
134
  - - ! '>='
122
135
  - !ruby/object:Gem::Version
123
136
  version: '0'
124
137
  requirements: []
125
138
  rubyforge_project: dq-readability
126
- rubygems_version: 1.8.11
139
+ rubygems_version: 2.1.11
127
140
  signing_key:
128
- specification_version: 3
141
+ specification_version: 4
129
142
  summary: Port of arc90's readability project to ruby
130
143
  test_files: []