orph 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.md +25 -0
- data/Rakefile +10 -0
- data/lib/orph.rb +56 -0
- data/lib/orph/version.rb +3 -0
- data/orph.gemspec +26 -0
- data/test/orph_test.rb +89 -0
- metadata +80 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
Orph
|
2
|
+
====
|
3
|
+
|
4
|
+
Orphans (commonly referred to as 'widows') are single-word lines at the end of paragraphs and are generally considered bad form by type nerds. This library removes them with non-breaking spaces.
|
5
|
+
|
6
|
+
## Usage
|
7
|
+
|
8
|
+
>> orph = Orph.new
|
9
|
+
=> #<Orph:0x000001008e7578 @content_tags=["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "blockquote", "dt", "dd"]>
|
10
|
+
|
11
|
+
>> orph.fix("<p>Here's some content.</p>")
|
12
|
+
=> "<p>Here's some&nbsp;content.</p>"
|
13
|
+
|
14
|
+
>> orph.fix "<p><span>some content</span><span>more content</span></p>"
|
15
|
+
=> "<p><span>some content</span><span>more&nbsp;content</span></p>"
|
16
|
+
|
17
|
+
>> orph.content_tags << "span"
|
18
|
+
=> ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "blockquote", "dt", "dd", "span"]
|
19
|
+
|
20
|
+
>> orph.fix "<p><span>some content</span><span>more content</span></p>"
|
21
|
+
=> "<p><span>some&nbsp;content</span><span>more&nbsp;content</span></p>"
|
22
|
+
|
23
|
+
* * *
|
24
|
+
|
25
|
+
(c) 2011 David Eisinger
|
data/Rakefile
ADDED
data/lib/orph.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
require "orph/version"
|
2
|
+
require "nokogiri"
|
3
|
+
|
4
|
+
# Replaces the last ordinary space inside each content element with a
# non-breaking space entity, so the final word of the element is kept on the
# same line as the word before it.
class Orph
  # Tag names treated as content containers when none are supplied.
  DEFAULT_CONTENT_TAGS = %w(h1 h2 h3 h4 h5 h6 p li blockquote dt dd).freeze

  # Entity written in place of the last space of a content element.
  NON_BREAKING_SPACE = "&nbsp;".freeze

  # Names of the tags whose text is processed. Callers may replace the list
  # or append to it (e.g. `orph.content_tags << "span"`).
  attr_accessor :content_tags

  # content_tags - optional list of tag names used instead of the defaults.
  #                The list is copied, so appending to the instance's tags
  #                never modifies the array that was passed in (or the
  #                frozen default).
  def initialize(content_tags = DEFAULT_CONTENT_TAGS)
    self.content_tags = content_tags.dup
  end

  # Parses +html+ as an HTML fragment, processes every content element and
  # returns the serialized fragment.
  #
  # nil or empty input returns an empty string without invoking the parser.
  def fix(html)
    return "" if html.nil? || html.empty?

    doc = Nokogiri::HTML::DocumentFragment.parse(html, "ASCII")
    parse_nodes(doc.children)
    doc.to_html
  end

  private

  # Walks +nodes+ depth-first. A content node has its last space replaced;
  # any other node is only descended into.
  def parse_nodes(nodes)
    nodes.each do |node|
      if content_node?(node)
        remove_widow(node.children)
      else
        parse_nodes(node.children)
      end
    end
  end

  # True for a text node that contains something other than whitespace.
  def text_node?(node)
    node.is_a?(Nokogiri::XML::Text) && !node.blank?
  end

  # True when the node's tag name is one of the configured content tags.
  def content_tag?(node)
    content_tags.include?(node.name)
  end

  # True when the node is a content tag that holds content of its own, i.e.
  # at least one child that is neither blank nor itself a content tag. A
  # content tag that merely wraps other content tags is not a content node;
  # its children are handled individually by parse_nodes.
  def content_node?(node)
    content_tag?(node) && !node.children.all? { |child| child.blank? || content_tag?(child) }
  end

  # Scans +nodes+ from last to first and replaces the last space found in the
  # first text node that contains one, descending into non-text nodes on the
  # way. Returns true once a replacement was made, false when no text node
  # under +nodes+ contains a space.
  def remove_widow(nodes)
    nodes.reverse.each do |node|
      if text_node?(node) && node.to_s.include?(" ")
        node.replace(replace_last_space(node.to_html))
        return true
      elsif !node.is_a?(Nokogiri::XML::Text)
        return true if remove_widow(node.children)
      end
    end

    false
  end

  # Returns +html+ with its last space swapped for NON_BREAKING_SPACE, or
  # +html+ unchanged when it contains no space.
  def replace_last_space(html)
    index = html.rindex(" ")
    return html unless index

    html[0...index] + NON_BREAKING_SPACE + html[(index + 1)..-1]
  end
end
|
56
|
+
|
data/lib/orph/version.rb
ADDED
data/orph.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "orph/version"
|
4
|
+
|
5
|
+
# Gem specification for orph: metadata, packaged files and dependencies.
Gem::Specification.new do |s|
  s.name        = "orph"
  s.version     = Orph::VERSION
  s.authors     = ["David Eisinger"]
  s.email       = ["david.eisinger@gmail.com"]
  s.homepage    = ""
  s.summary     = %q{Orph is a library for removing typographic orphans}
  # Built from adjacent literals so the published description is one line;
  # a multi-line %q{} literal embeds its newlines and indentation in the text.
  s.description = "Orphans (commonly referred to as 'widows') are " \
                  "single-word lines at the end of paragraphs. This " \
                  "library removes them with non-breaking spaces."

  s.rubyforge_project = "orph"

  # File lists are taken from the output of `git ls-files`.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "nokogiri"

  s.add_development_dependency "riot"
end
|
data/test/orph_test.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require "riot"
|
2
|
+
require "orph"
|
3
|
+
|
4
|
+
# Riot contexts for Orph#fix. Inputs use ordinary spaces; every expectation
# spells the non-breaking space out as the "&nbsp;" entity so the difference
# between input and output is visible in the source.
context "Orph, removing orphans" do
  setup { Orph.new }

  context "with a single HTML tag" do
    setup { topic.fix("<p>these are words</p>") }
    asserts_topic.equals("<p>these are&nbsp;words</p>")
  end

  context "with multiple HTML tags" do
    setup { topic.fix("<p>one word</p><p>two words</p>") }
    asserts_topic.equals("<p>one&nbsp;word</p><p>two&nbsp;words</p>")
  end

  context "with a non-block-level nested tag" do
    setup { topic.fix "<p>This is <strong>important</strong>.</p>" }
    asserts_topic.equals("<p>This is&nbsp;<strong>important</strong>.</p>")
  end

  context "with nested HTML tags" do
    setup { topic.fix("<ul><li>one word</li><li>two words</li></ul>").gsub(/\n/, '') }
    asserts_topic.equals("<ul><li>one&nbsp;word</li><li>two&nbsp;words</li></ul>")
  end

  # The only space is inside the tag markup, so nothing may change.
  context "with a space inside an HTML tag" do
    setup { topic.fix('<p><a href="http://google.com/">Google</a></p>') }
    asserts_topic.equals('<p><a href="http://google.com/">Google</a></p>')
  end

  context "with text and a short link" do
    setup { topic.fix('<p>This is a <a href="#">link</a></p>') }
    asserts_topic.equals('<p>This is a&nbsp;<a href="#">link</a></p>')
  end

  context "with text, a long link, and ending text" do
    setup { topic.fix('<p>This is a <a href="#">long link</a>.</p>') }
    asserts_topic.equals('<p>This is a <a href="#">long&nbsp;link</a>.</p>')
  end

  context "with paragraph consisting of two links" do
    setup { topic.fix('<p><a href="#">link one</a><a href="#">link two</a></p>') }
    asserts_topic.equals('<p><a href="#">link one</a><a href="#">link&nbsp;two</a></p>')
  end

  context "with a div with two paragraphs and free text" do
    setup do
      html = '<div>some text<p>paragraph 1</p><p>paragraph 2</p></div>'
      # Ordinary spaces and newlines are stripped, so only entities survive.
      topic.fix(html).gsub(/( |\n)/, "")
    end

    # NOTE(review): "some text" sits directly inside a <div>, which is not one
    # of the default content tags - confirm the library is expected to
    # protect this text.
    asserts_topic.includes("some&nbsp;text")
    asserts_topic.includes("<p>paragraph&nbsp;1</p>")
    asserts_topic.includes("<p>paragraph&nbsp;2</p>")
  end

  context "with a UL containing paragraphs" do
    setup do
      html = <<-HTML
        <div>
          <ul>
            <li>some text</li>
            <li>
              <p>par. 1</p>
              <p>par. 2</p>
            </li>
          </ul>
        </div>
      HTML

      # Ordinary spaces and newlines are stripped, so only entities survive.
      topic.fix(html).gsub(/( |\n)/, "")
    end

    asserts_topic.includes("<li>some&nbsp;text</li>")
    asserts_topic.includes("<li><p>par.&nbsp;1</p><p>par.&nbsp;2</p></li>")
  end

  context "with span as a content container" do
    setup do
      topic.content_tags << "span"
      topic.fix("<p><span>span 1</span><span>span 2</span></p>")
    end

    asserts_topic.includes("<span>span&nbsp;1</span>")
    asserts_topic.includes("<span>span&nbsp;2</span>")
  end
end
|
89
|
+
|
metadata
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: orph
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- David Eisinger
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-10-03 00:00:00.000000000 -04:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: nokogiri
|
17
|
+
requirement: &2153291960 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '0'
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *2153291960
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: riot
|
28
|
+
requirement: &2153291540 !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: *2153291540
|
37
|
+
description: ! "Orphans (commonly referred to as 'widows') are\n single-word
|
38
|
+
lines at the end of paragraphs. This\n library removes them
|
39
|
+
with non-breaking spaces."
|
40
|
+
email:
|
41
|
+
- david.eisinger@gmail.com
|
42
|
+
executables: []
|
43
|
+
extensions: []
|
44
|
+
extra_rdoc_files: []
|
45
|
+
files:
|
46
|
+
- .gitignore
|
47
|
+
- Gemfile
|
48
|
+
- README.md
|
49
|
+
- Rakefile
|
50
|
+
- lib/orph.rb
|
51
|
+
- lib/orph/version.rb
|
52
|
+
- orph.gemspec
|
53
|
+
- test/orph_test.rb
|
54
|
+
has_rdoc: true
|
55
|
+
homepage: ''
|
56
|
+
licenses: []
|
57
|
+
post_install_message:
|
58
|
+
rdoc_options: []
|
59
|
+
require_paths:
|
60
|
+
- lib
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
63
|
+
requirements:
|
64
|
+
- - ! '>='
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
none: false
|
69
|
+
requirements:
|
70
|
+
- - ! '>='
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: '0'
|
73
|
+
requirements: []
|
74
|
+
rubyforge_project: orph
|
75
|
+
rubygems_version: 1.6.2
|
76
|
+
signing_key:
|
77
|
+
specification_version: 3
|
78
|
+
summary: Orph is a library for removing typographic orphans
|
79
|
+
test_files:
|
80
|
+
- test/orph_test.rb
|