wraptext 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/Gemfile +5 -0
- data/README.md +27 -0
- data/Rakefile +4 -0
- data/lib/wraptext.rb +2 -0
- data/lib/wraptext/parser.rb +121 -0
- data/spec/spec_helper.rb +6 -0
- data/spec/wraptext/parser_spec.rb +116 -0
- data/wraptext.gemspec +17 -0
- metadata +89 -0
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# Wraptext
|
2
|
+
## What is it?
|
3
|
+
|
4
|
+
Wraptext is a small library designed to accept "blog-style" newline-delimited text with markup, and to return a formatted document with bare text wrapped in `<p>` tags, splitting text nodes with double newlines in them into multiple paragraphs.
|
5
|
+
|
6
|
+
## How to use it
|
7
|
+
|
8
|
+
Add it to your Gemfile:
|
9
|
+
|
10
|
+
gem 'wraptext'
|
11
|
+
|
12
|
+
Then parse your text with it:
|
13
|
+
|
14
|
+
Wraptext::Parser.new(your_html_fragment).to_html
|
15
|
+
|
16
|
+
This returns your fragment with bare text wrapped in paragraph tags, and with text nodes that contain double newlines split into distinct paragraphs. The primary intent was to turn WordPress-generated post content into valid HTML documents, but because the parser is designed to work on generic HTML documents, it may be used beyond WordPress content.
|
17
|
+
|
18
|
+
`Wraptext::Parser` also accepts a Nokogiri document, if you already have one you are working with. Wraptext will *not* modify the document object you pass in; it builds the new document tree inside its own internal Nokogiri document. You may access the `<body>` element of that new tree with `#to_doc`, if desired.
|
19
|
+
|
20
|
+
|
21
|
+
## Why not simple_format?
|
22
|
+
|
23
|
+
simple_format is not HTML-aware, and may potentially mangle HTML in ways that you don't want. For example, it would mangle `<script>` and `<pre>` sections in text, breaking them.
|
24
|
+
|
25
|
+
## Why not regexes, like Wordpress does it?
|
26
|
+
|
27
|
+
Mostly because parsing HTML with regexes is almost never the right solution. Using Nokogiri ensures a properly-formed document.
|
data/Rakefile
ADDED
data/lib/wraptext.rb
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
module Wraptext
  # Rebuilds an HTML fragment so that bare text ends up inside <p> elements and
  # text containing runs of two or more newlines is split into separate
  # paragraphs. The source document is only read: nodes are cloned or re-created
  # inside a separate output document held in @root.
  class Parser
    # Elements treated as block-level. Meeting one of these closes any <p> that
    # is currently being filled, because blocks are siblings of paragraphs.
    BLOCK_TAGS = %w[table thead tfoot caption col colgroup tbody tr td th div dl dd dt
                    ul ol li pre select option form map area blockquote address math style input hr
                    fieldset legend section article aside hgroup header footer nav p
                    figure figcaption details menu summary h1 h2 h3 h4 h5 h6 script]
    BLOCK_TAGS_LOOKUP = Hash[BLOCK_TAGS.map { |tag| [tag, 1] }]

    # Elements whose direct content must never be wrapped in a <p>
    # (for example <h1> or <li>).
    NO_WRAP_TAG = %w[table thead tfoot caption col colgroup tbody tr td th dl dd dt
                     ul ol li pre select option form map area math style input hr
                     fieldset legend section article aside hgroup header footer nav
                     figure figcaption details menu summary h1 h2 h3 h4 h5 h6 script]
    NO_WRAP_TAG_LOOKUP = Hash[NO_WRAP_TAG.map { |tag| [tag, 1] }]

    # Block elements that are cloned verbatim instead of being traversed.
    STRAIGHT_COPY_TAGS = %w[script pre textarea]
    STRAIGHT_COPY_TAGS_LOOKUP = Hash[STRAIGHT_COPY_TAGS.map { |tag| [tag, 1] }]

    # Paragraph separator. Because the pattern contains a capture group,
    # String#split also returns the captured newline between the real chunks.
    # Inside no-wrap elements that newline is kept as text; elsewhere it becomes
    # a whitespace-only <p>, which strip_empty_paragraphs! removes afterwards.
    MULTIPLE_NEWLINES_REGEX = /(\r\n|\n){2,}/

    # Convenience wrapper: parse +text+ and return the formatted HTML string.
    def self.parse(text)
      new(text).to_html
    end

    # @param text_or_nokogiri_doc [String, Nokogiri::XML::Document] HTML source
    #   or an already-parsed document
    # @raise [RuntimeError] when given anything else
    def initialize(text_or_nokogiri_doc)
      @doc = case text_or_nokogiri_doc
             when Nokogiri::XML::Document then text_or_nokogiri_doc
             when String then Nokogiri::HTML(text_or_nokogiri_doc)
             else raise "#initialize requires a string or Nokogiri document"
             end
      @root = Nokogiri::HTML "<body></body>"

      # A source document may have no <body> at all (nothing for the XPath to
      # match); there is then nothing to copy and the output stays empty.
      source_body = @doc.xpath("/html/body").first
      reparent_nodes @root.xpath("/html/body").first, source_body if source_body
      strip_empty_paragraphs!
    end

    # @return [String] inner HTML of the rebuilt <body>, memoized
    def to_html
      @html ||= @root.xpath("/html/body").inner_html
    end

    # @return [Nokogiri::XML::Element] the rebuilt <body> element, memoized
    def to_doc
      @doc_out ||= @root.xpath("/html/body").first
    end

    private

    # Removes every <p> from the output whose text is blank and whose children
    # are all text nodes. A <p> that holds any element child is kept.
    def strip_empty_paragraphs!
      @root.xpath("//p").each do |paragraph|
        next unless paragraph.text.strip == ''
        paragraph.remove if paragraph.children.all? { |child| child.name == "text" }
      end
    end

    # Copies the children of +parent+ (a node of the source document) into
    # +top+ (a node of the output document), inserting <p> elements where bare
    # text or inline elements would otherwise sit directly in a wrapping
    # container. This is a document-oriented approach, rather than a
    # regex-oriented one.
    #
    # +top+ is re-pointed as paragraphs are opened and closed, so consecutive
    # inline siblings such as "<em>Foo</em> Bar" share a single <p>.
    def reparent_nodes(top, parent)
      parent.children.each do |node|
        if BLOCK_TAGS_LOOKUP.has_key? node.name
          # Block elements are siblings of <p>, never children: step back out of
          # any paragraph that is currently open.
          top = top.parent while top.name == "p"

          if STRAIGHT_COPY_TAGS_LOOKUP.has_key? node.name
            # <script>, <pre>, ... are copied untouched.
            top.add_child node.clone
          else
            # Re-create the element empty, then fill it by recursing over the
            # original's children.
            copy = @root.create_element node.name, node.attributes
            top.add_child copy
            reparent_nodes copy, node
          end
        elsif node.text?
          node.content.split(MULTIPLE_NEWLINES_REGEX).each_with_index do |text, index|
            if (index == 0 && top.name == "p") || NO_WRAP_TAG_LOOKUP.has_key?(top.name)
              # The first chunk continues the open paragraph; text inside a
              # no-wrap element is never wrapped.
              top.add_child @root.create_text_node(text)
            elsif top.name == "p"
              # Later chunks start a new paragraph right after the current one.
              paragraph = @root.create_element "p", text
              top.after paragraph
              top = paragraph
            else
              paragraph = @root.create_element "p", text
              top.add_child paragraph
              top = paragraph
            end
          end
        elsif top.name == "p" || NO_WRAP_TAG_LOOKUP.has_key?(top.name)
          # Inline element inside an open paragraph, or inside an element that
          # stands in for one (<h1>, <li>, ...): copy it in as-is.
          top.add_child node.clone
        else
          # Inline element directly inside a wrapping container: open a new <p>
          # around a copy of it. Following text or inline siblings reuse this
          # paragraph because +top+ now points at it.
          paragraph = @root.create_element "p"
          paragraph.add_child node.clone
          top.add_child paragraph
          top = paragraph
        end
      end
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Behavioural examples for Wraptext::Parser: accepted input types, the return
# types of #to_html / #to_doc, paragraph wrapping of plain text, and the
# handling of block-level, p-peer and <script> elements.
describe Wraptext::Parser do
  it "should accept a String document" do
    parser = Wraptext::Parser.new("This is a string document")
    parser.should be_a(Wraptext::Parser)
  end

  it "should accept a Nokogiri document" do
    parsed = Nokogiri::HTML("<body>foo</body>")
    Wraptext::Parser.new(parsed).should be_a(Wraptext::Parser)
  end

  # Output types only; the content is checked in the contexts further down.
  context "given a document" do
    before(:each) do
      @doc = Wraptext::Parser.new(<<-EOF)
This is some text.

This is some more text.
EOF
    end

    it "should return a string from #to_html" do
      @doc.to_html.should be_a(String)
    end

    it "should return a Nokogiri::XML::Element from #to_doc" do
      @doc.to_doc.should be_a(Nokogiri::XML::Element)
    end
  end

  # Two chunks of text separated by a blank line become two paragraphs.
  context "given a set of plain text" do
    before(:each) do
      @doc = Wraptext::Parser.new(<<-EOF)
This is some text.

This is some more text.
EOF
    end

    it "should convert plain text to p-wrapped text" do
      expected = <<-EOF
<p>This is some text.</p>
<p> This is some more text.
</p>
EOF
      @doc.to_html.should == expected.strip
    end
  end

  # A <div> interrupts the surrounding paragraphs and has its own text wrapped.
  context "given plain text with a block element in the middle" do
    it "should respect block-level elements" do
      source = <<-EOF
This is some text
<div>This is a block level element</div>
This is some text after the block element
EOF
      expected = <<-EOF
<p>This is some text
</p>
<div><p>This is a block level element</p></div>
<p>
This is some text after the block element
</p>
EOF
      actual = Wraptext::Parser.new(source).to_html
      actual.should == expected.strip
    end
  end


  # An <h1> is a peer of <p>: its text must stay unwrapped.
  context "given plain text with some p-peer tags" do
    it "should not inject p tags directly inside p-peer tags" do
      source = <<-EOF
This is some text
<h1>This is a p-peer element</h1>
This is some text after the block element
EOF
      expected = <<-EOF
<p>This is some text
</p>
<h1>This is a p-peer element</h1>
<p>
This is some text after the block element
</p>
EOF
      actual = Wraptext::Parser.new(source).to_html
      actual.should == expected.strip
    end
  end

  # The blank line inside <script> must survive untouched.
  context "given a <script> tag" do
    it "should not perform any transformation inside the tag" do
      source = <<-EOF
This is some precursor text

And another line
<script>
var elem = 'this is some javascript';

elem = elem.toUpperCase();
</script>
EOF
      expected = <<-EOF
<p>This is some precursor text</p>
<p>And another line
</p>
<script>
var elem = 'this is some javascript';

elem = elem.toUpperCase();
</script>
EOF
      actual = Wraptext::Parser.new(source).to_html
      actual.should == expected.strip
    end
  end
end
|
data/wraptext.gemspec
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
# Packaging manifest for the wraptext gem. The file, test-file and executable
# lists are taken from whatever git currently tracks, so this must be evaluated
# inside the project's git checkout.
Gem::Specification.new do |spec|
  spec.name          = "wraptext"
  spec.version       = "0.1"
  spec.authors       = ["Chris Heald"]
  spec.email         = ["cheald@gmail.com"]
  spec.homepage      = ""
  spec.summary       = %q{Wraps bare text nodes from an HTML document in <p> tags and splits text nodes on double newlines.}
  spec.description   = %q{Wraps bare text nodes from an HTML document in <p> tags and splits text nodes on double newlines. Conveniently serves to format Wordpress post content properly as a side effect.}

  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  # Executables are listed by base name only.
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_dependency "nokogiri"
end
|
metadata
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wraptext
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 9
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: "0.1"
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Chris Heald
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2012-01-29 00:00:00 Z
|
18
|
+
dependencies:
|
19
|
+
- !ruby/object:Gem::Dependency
|
20
|
+
name: nokogiri
|
21
|
+
prerelease: false
|
22
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
none: false
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
hash: 3
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
version: "0"
|
31
|
+
type: :runtime
|
32
|
+
version_requirements: *id001
|
33
|
+
description: Wraps bare text nodes from an HTML document in <p> tags and splits text nodes on double newlines. Conveniently serves to format Wordpress post content properly as a side effect.
|
34
|
+
email:
|
35
|
+
- cheald@gmail.com
|
36
|
+
executables: []
|
37
|
+
|
38
|
+
extensions: []
|
39
|
+
|
40
|
+
extra_rdoc_files: []
|
41
|
+
|
42
|
+
files:
|
43
|
+
- .gitignore
|
44
|
+
- .rspec
|
45
|
+
- Gemfile
|
46
|
+
- README.md
|
47
|
+
- Rakefile
|
48
|
+
- lib/wraptext.rb
|
49
|
+
- lib/wraptext/parser.rb
|
50
|
+
- spec/spec_helper.rb
|
51
|
+
- spec/wraptext/parser_spec.rb
|
52
|
+
- wraptext.gemspec
|
53
|
+
homepage: ""
|
54
|
+
licenses: []
|
55
|
+
|
56
|
+
post_install_message:
|
57
|
+
rdoc_options: []
|
58
|
+
|
59
|
+
require_paths:
|
60
|
+
- lib
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
hash: 3
|
67
|
+
segments:
|
68
|
+
- 0
|
69
|
+
version: "0"
|
70
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 3
|
76
|
+
segments:
|
77
|
+
- 0
|
78
|
+
version: "0"
|
79
|
+
requirements: []
|
80
|
+
|
81
|
+
rubyforge_project:
|
82
|
+
rubygems_version: 1.8.12
|
83
|
+
signing_key:
|
84
|
+
specification_version: 3
|
85
|
+
summary: Wraps bare text nodes from an HTML document in <p> tags and splits text nodes on double newlines.
|
86
|
+
test_files:
|
87
|
+
- spec/spec_helper.rb
|
88
|
+
- spec/wraptext/parser_spec.rb
|
89
|
+
has_rdoc:
|