dq-readability 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MjQ0NzVlMmZjMjM1ZjI2MzI1NjJiZTBmYjBjNjllYzFkMDZiNmQ4OQ==
5
+ data.tar.gz: !binary |-
6
+ MWM2MzE4ZDgyOTZhNTVhYWE3YzgxOTdmY2RiZWRkZWIzZGQzMDU5Zg==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ MGNkNjRmMzgzMWE4NzZjNzg0MDVhYzI0NzJmY2FmMmZiZTAwYTIxYmMyNzE0
10
+ OTE4ODFiODg1ZjBhNTcxMTYxNTk3MzdkMmE0MjcwN2E3MzY1MzU5NDY5M2Fh
11
+ ODNmYTIwYWU3MjYyZWE1YWUyODI0NGUzZWVjN2E5MTYyNjEyZTI=
12
+ data.tar.gz: !binary |-
13
+ OWQ1OWZiYTFmYTUwODk1YjZjZGM1ZDcxOWQwNWIxMDI3ZjRjZTc3YjUzYzY2
14
+ ZTU5OTEwYzViMmNhMzdkOWQxOTlmNjJlODEwMGUzNDdiZjY5YmU2YzNjYWQ3
15
+ MDRmNjYyNWU4NGYxMWMxOThiZDJmYjIyZGNkMjgzZDI3MDU2MTM=
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "dq-readability"
6
- s.version = '1.0.1'
6
+ s.version = '1.0.2'
7
7
  s.authors = ["Prateek Papriwal"]
8
8
  s.email = ["papriwalprateek@gmail.com"]
9
9
  s.homepage = "http://github.com/DaQwest/dq-readability"
@@ -3,9 +3,11 @@
3
3
  require 'rubygems'
4
4
  require 'nokogiri'
5
5
  require 'guess_html_encoding'
6
+ require 'open-uri'
6
7
 
7
8
  module DQReadability
8
9
  class Document
10
+
9
11
  DEFAULT_OPTIONS = {
10
12
  :retry_length => 250,
11
13
  :min_text_length => 25,
@@ -36,8 +38,9 @@ module DQReadability
36
38
 
37
39
  def initialize(input, options = {})
38
40
  @options = DEFAULT_OPTIONS.merge(options)
39
- @input = input
40
-
41
+ @input = open(input).read
42
+ @url = input
43
+
41
44
  if RUBY_VERSION =~ /^(1\.9|2)/ && !@options[:encoding]
42
45
  @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
43
46
  @options[:encoding] = @input.encoding.to_s
@@ -67,6 +70,72 @@ module DQReadability
67
70
 
68
71
  # Remove html comment tags
69
72
  @html.xpath('//comment()').each { |i| i.remove }
73
+
74
+ # making all the headings of same format
75
+ @html.css("h1").each do |h|
76
+ h.name = "h2"
77
+ end
78
+
79
+ @html.css("h2").each do |h|
80
+ h.name = "h3"
81
+ end
82
+
83
+ @html.css("h4").each do |h|
84
+ h.name = "h3"
85
+ end
86
+
87
+ uri = URI.parse(@url)
88
+ host = uri.host
89
+ scheme = uri.scheme
90
+ port = uri.port # defaults to 80
91
+ base = "#{scheme}://#{host}:#{port}/"
92
+
93
+
94
+
95
+ # changing img src
96
+ @html.css("img").each do |elem|
97
+ begin
98
+ if elem['src'][0] == '/'
99
+ elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
100
+ else
101
+ if @url.split('').last == '/'
102
+ elem['src'] = URI.join(@url,elem['src']).to_s if URI.parse(elem['src']).host == nil
103
+ else
104
+ x = @url.split('/')
105
+ x.delete_at(x.length-1)
106
+ y = ''
107
+ x.each{|i| y += i+'/'}
108
+ elem['src'] = URI.join(y,elem['src']).to_s if URI.parse(elem['src']).host == nil
109
+ end
110
+ end
111
+ rescue
112
+ elem.remove
113
+ end
114
+ end
115
+
116
+ #changing the 'a' href
117
+
118
+ @html.css("a").each do |elem|
119
+ begin
120
+ if elem['href'][0] == '/'
121
+ elem['href'] = URI.join(base,elem['href']).to_s if URI.parse(elem['href']).host == nil
122
+ else
123
+ if @url.split('').last == '/'
124
+ elem['href'] = URI.join(@url,elem['href']).to_s if URI.parse(elem['href']).host == nil
125
+ else
126
+ x = @url.split('/')
127
+ x.delete_at(x.length-1)
128
+ y = ''
129
+ x.each{|i| y += i+'/'}
130
+ elem['href'] = URI.join(y,elem['href']).to_s if URI.parse(elem['href']).host == nil
131
+ end
132
+ end
133
+ rescue
134
+ elem['href'] = ""
135
+ end
136
+ end
137
+
138
+
70
139
  end
71
140
 
72
141
  def images(content=nil, reload=false)
@@ -407,12 +476,12 @@ module DQReadability
407
476
  elem.remove
408
477
  end
409
478
 
410
- if @options[:remove_empty_nodes]
411
- # remove <p> tags that have no text content - this will also remove p tags that contain only images.
412
- node.css("p").each do |elem|
413
- elem.remove if elem.content.strip.empty?
414
- end
415
- end
479
+ # if @options[:remove_empty_nodes]
480
+ # # remove <p> tags that have no text content - this will also remove p tags that contain only images.
481
+ # node.css("p").each do |elem|
482
+ # elem.remove if elem.content.strip.empty?
483
+ # end
484
+ # end
416
485
 
417
486
  # Conditionally clean <table>s, <ul>s, and <div>s
418
487
  clean_conditionally(node, candidates, "table, ul, div")
@@ -481,28 +550,28 @@ module DQReadability
481
550
  to_remove = false
482
551
  reason = ""
483
552
 
484
- # if (counts["img"] > counts["p"]) && (counts["img"] > 1)
485
- # reason = "too many images"
486
- # to_remove = true
487
- # elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
488
- # reason = "more <li>s than <p>s"
489
- # to_remove = true
490
- # elsif counts["input"] > (counts["p"] / 3).to_i
491
- # reason = "less than 3x <p>s than <input>s"
492
- # to_remove = true
493
- # elsif (content_length < options[:min_text_length]) && (counts["img"] != 1)
494
- # reason = "too short a content length without a single image"
495
- # to_remove = true
496
- # elsif weight < 25 && link_density > 0.2
497
- # reason = "too many links for its weight (#{weight})"
498
- # to_remove = true
499
- # elsif weight >= 25 && link_density > 0.5
500
- # reason = "too many links for its weight (#{weight})"
501
- # to_remove = true
502
- # elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
503
- # reason = "<embed>s with too short a content length, or too many <embed>s"
504
- # to_remove = true
505
- # end
553
+ if (counts["img"] > counts["p"]) && (counts["img"] > 1)
554
+ reason = "too many images"
555
+ to_remove = true
556
+ elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
557
+ reason = "more <li>s than <p>s"
558
+ to_remove = true
559
+ elsif counts["input"] > (counts["p"] / 3).to_i
560
+ reason = "less than 3x <p>s than <input>s"
561
+ to_remove = true
562
+ elsif (content_length < options[:min_text_length]) && (counts["img"] != 1)
563
+ reason = "too short a content length without a single image"
564
+ to_remove = true
565
+ elsif weight < 25 && link_density > 0.2
566
+ reason = "too many links for its weight (#{weight})"
567
+ to_remove = true
568
+ elsif weight >= 25 && link_density > 0.5
569
+ reason = "too many links for its weight (#{weight})"
570
+ to_remove = true
571
+ elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
572
+ reason = "<embed>s with too short a content length, or too many <embed>s"
573
+ to_remove = true
574
+ end
506
575
 
507
576
  if to_remove
508
577
  debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
metadata CHANGED
@@ -1,71 +1,85 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dq-readability
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
5
- prerelease:
4
+ version: 1.0.2
6
5
  platform: ruby
7
6
  authors:
8
7
  - Prateek Papriwal
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2014-01-22 00:00:00.000000000 Z
11
+ date: 2014-01-24 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: rspec
16
- requirement: &11035780 !ruby/object:Gem::Requirement
17
- none: false
15
+ requirement: !ruby/object:Gem::Requirement
18
16
  requirements:
19
17
  - - ! '>='
20
18
  - !ruby/object:Gem::Version
21
19
  version: '2.8'
22
20
  type: :development
23
21
  prerelease: false
24
- version_requirements: *11035780
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '2.8'
25
27
  - !ruby/object:Gem::Dependency
26
28
  name: rspec-expectations
27
- requirement: &11034780 !ruby/object:Gem::Requirement
28
- none: false
29
+ requirement: !ruby/object:Gem::Requirement
29
30
  requirements:
30
31
  - - ! '>='
31
32
  - !ruby/object:Gem::Version
32
33
  version: '2.8'
33
34
  type: :development
34
35
  prerelease: false
35
- version_requirements: *11034780
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '2.8'
36
41
  - !ruby/object:Gem::Dependency
37
42
  name: rr
38
- requirement: &11034020 !ruby/object:Gem::Requirement
39
- none: false
43
+ requirement: !ruby/object:Gem::Requirement
40
44
  requirements:
41
45
  - - ! '>='
42
46
  - !ruby/object:Gem::Version
43
47
  version: '1.0'
44
48
  type: :development
45
49
  prerelease: false
46
- version_requirements: *11034020
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
47
55
  - !ruby/object:Gem::Dependency
48
56
  name: nokogiri
49
- requirement: &11033240 !ruby/object:Gem::Requirement
50
- none: false
57
+ requirement: !ruby/object:Gem::Requirement
51
58
  requirements:
52
59
  - - ! '>='
53
60
  - !ruby/object:Gem::Version
54
61
  version: 1.4.2
55
62
  type: :runtime
56
63
  prerelease: false
57
- version_requirements: *11033240
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ! '>='
67
+ - !ruby/object:Gem::Version
68
+ version: 1.4.2
58
69
  - !ruby/object:Gem::Dependency
59
70
  name: guess_html_encoding
60
- requirement: &11032640 !ruby/object:Gem::Requirement
61
- none: false
71
+ requirement: !ruby/object:Gem::Requirement
62
72
  requirements:
63
73
  - - ! '>='
64
74
  - !ruby/object:Gem::Version
65
75
  version: 0.0.4
66
76
  type: :runtime
67
77
  prerelease: false
68
- version_requirements: *11032640
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: 0.0.4
69
83
  description: Port of arc90's readability project to ruby. The base code is derived
70
84
  from https://github.com/cantino/ruby-readability
71
85
  email:
@@ -105,26 +119,25 @@ files:
105
119
  - spec/spec_helper.rb
106
120
  homepage: http://github.com/DaQwest/dq-readability
107
121
  licenses: []
122
+ metadata: {}
108
123
  post_install_message:
109
124
  rdoc_options: []
110
125
  require_paths:
111
126
  - lib
112
127
  required_ruby_version: !ruby/object:Gem::Requirement
113
- none: false
114
128
  requirements:
115
129
  - - ! '>='
116
130
  - !ruby/object:Gem::Version
117
131
  version: '0'
118
132
  required_rubygems_version: !ruby/object:Gem::Requirement
119
- none: false
120
133
  requirements:
121
134
  - - ! '>='
122
135
  - !ruby/object:Gem::Version
123
136
  version: '0'
124
137
  requirements: []
125
138
  rubyforge_project: dq-readability
126
- rubygems_version: 1.8.11
139
+ rubygems_version: 2.1.11
127
140
  signing_key:
128
- specification_version: 3
141
+ specification_version: 4
129
142
  summary: Port of arc90's readability project to ruby
130
143
  test_files: []