wraptext 0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
# Fetch gems over HTTPS so downloads are protected in transit.
source 'https://rubygems.org'

# Runtime dependencies are declared in wraptext.gemspec.
gemspec

# Needed only to run the spec suite.
gem 'rspec'
data/README.md ADDED
@@ -0,0 +1,27 @@
1
+ # Wraptext
2
+ ## What is it?
3
+
4
+ Wraptext is a small library designed to accept "blog-style" newline-delimited text with markup, and to return a formatted document with bare text wrapped in `<p>` tags, splitting text nodes with double newlines in them into multiple paragraphs.
5
+
6
+ ## How to use it
7
+
8
+ Add it to your Gemfile:
9
+
10
+ gem 'wraptext'
11
+
12
+ Then parse your text with it:
13
+
14
+ Wraptext::Parser.new(your_html_fragment).to_html
15
+
16
+ This'll return your text fragment with bare text wrapped in paragraph tags, and text nodes that include double newlines split into distinct paragraphs. The primary intent was to enable parsing of WordPress-generated post content into valid HTML documents, but because the parser is designed to work on generic HTML documents, it may be used beyond WordPress content.
17
+
18
+ `Wraptext::Parser` accepts a Nokogiri document, as well, if you already have an existing document you are working with. Wraptext will *not* modify the original document object you pass in; it will create its own internal Nokogiri document to build the new document tree from. You may access this new document with `#to_doc`, if desired.
19
+
20
+
21
+ ## Why not simple_format?
22
+
23
+ simple_format is not HTML-aware, and may potentially mangle HTML in ways that you don't want. For example, it would mangle `<script>` and `<pre>` sections in text, breaking them.
24
+
25
+ ## Why not regexes, like Wordpress does it?
26
+
27
+ Mostly because parsing HTML with regexes is almost never the right solution. Using Nokogiri ensures a properly-formed document.
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+ require 'rspec/core/rake_task'
4
+ RSpec::Core::RakeTask.new('spec')
data/lib/wraptext.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'nokogiri'
2
+ require 'wraptext/parser'
@@ -0,0 +1,121 @@
1
module Wraptext
  # Rebuilds an HTML fragment so that bare text ends up wrapped in <p>
  # elements, and text containing runs of two or more newlines is split into
  # several paragraphs.
  #
  # The source document is only read: nodes are cloned or re-created inside a
  # separate internal document (@root), which is what #to_html and #to_doc
  # expose.
  class Parser
    # Elements that are siblings of <p> rather than children of it. Hitting
    # one of these closes any <p> that is currently being filled.
    BLOCK_TAGS = %w[
      table thead tfoot caption col colgroup tbody tr td th div dl dd dt
      ul ol li pre select option form map area blockquote address math style input hr
      fieldset legend section article aside hgroup header footer nav p
      figure figcaption details menu summary h1 h2 h3 h4 h5 h6 script
    ]
    BLOCK_TAGS_LOOKUP = Hash[*BLOCK_TAGS.map { |tag| [tag, 1] }.flatten]

    # Containers whose direct content must NOT be wrapped in <p>
    # (BLOCK_TAGS minus div, blockquote, address and p).
    NO_WRAP_TAG = %w[
      table thead tfoot caption col colgroup tbody tr td th dl dd dt
      ul ol li pre select option form map area math style input hr
      fieldset legend section article aside hgroup header footer nav
      figure figcaption details menu summary h1 h2 h3 h4 h5 h6 script
    ]
    NO_WRAP_TAG_LOOKUP = Hash[*NO_WRAP_TAG.map { |tag| [tag, 1] }.flatten]

    # Block elements that are cloned verbatim instead of being traversed.
    STRAIGHT_COPY_TAGS = %w[script pre textarea]
    STRAIGHT_COPY_TAGS_LOOKUP = Hash[*STRAIGHT_COPY_TAGS.map { |tag| [tag, 1] }.flatten]

    # NOTE: the capture group is deliberate-by-effect. String#split appends
    # the group's last match (a single newline) to its result, so
    # "a\n\nb" splits into ["a", "\n", "b"]. In wrapping contexts the "\n"
    # chunk becomes a whitespace-only <p> that strip_empty_paragraphs! removes;
    # in no-wrap contexts it keeps one newline between the chunks.
    MULTIPLE_NEWLINES_REGEX = /(\r\n|\n){2,}/

    # Convenience wrapper: parses +text+ and returns the resulting HTML string.
    def self.parse(text)
      new(text).to_html
    end

    # text_or_nokogiri_doc - an HTML String, or an existing
    #                        Nokogiri::XML::Document (left unmodified).
    #
    # Raises RuntimeError for any other argument type.
    def initialize(text_or_nokogiri_doc)
      @doc = case text_or_nokogiri_doc
             when Nokogiri::XML::Document
               text_or_nokogiri_doc
             when String
               Nokogiri::HTML text_or_nokogiri_doc
             else
               raise "#initialize requires a string or Nokogiri document"
             end
      @root = Nokogiri::HTML "<body></body>"

      # A source without /html/body has nothing to copy; without this guard
      # the traversal would be handed nil and fail with NoMethodError.
      source_body = @doc.xpath("/html/body").first
      reparent_nodes @root.xpath("/html/body").first, source_body if source_body
      strip_empty_paragraphs!
    end

    # Returns the rebuilt fragment (the inner HTML of the new <body>) as a
    # String. The result is memoized.
    def to_html
      @html ||= @root.xpath("/html/body").inner_html
    end

    # Returns the <body> element of the rebuilt document. Memoized.
    def to_doc
      @doc_out ||= @root.xpath("/html/body").first
    end

    private

    # Removes every <p> whose text is blank and whose children are all text
    # nodes. A blank <p> that holds any other kind of node (an <img>, say)
    # is kept.
    def strip_empty_paragraphs!
      @root.xpath("//p").each do |paragraph|
        next unless paragraph.text.strip == ''
        paragraph.remove if paragraph.children.all? { |child| child.name == "text" }
      end
    end

    # Copies the children of +parent+ (a node in the source document) under
    # +top+ (a node in the new document), inserting <p> elements as needed.
    #
    # This is a document-oriented approach rather than a regex-oriented one
    # like Wordpress takes in PHP. simple_format is not appropriate here, as it
    # does not consider block-level element context when substituting.
    #
    # +top+ is the current insertion point; each helper returns the insertion
    # point to use for the next sibling.
    def reparent_nodes(top, parent)
      parent.children.each do |node|
        top = if BLOCK_TAGS_LOOKUP.has_key?(node.name)
                copy_block_node(top, node)
              elsif node.text?
                copy_text_node(top, node)
              else
                copy_inline_node(top, node)
              end
      end
    end

    # Copies a block-level element and returns the new insertion point.
    def copy_block_node(top, node)
      # Block-level elements are siblings of <p>, not children: unwind out of
      # any <p> we are currently filling.
      top = top.parent while top.name == "p"

      if STRAIGHT_COPY_TAGS_LOOKUP.has_key?(node.name)
        # <pre>, <script> and friends are copied untouched.
        top.add_child node.clone
      else
        # Create an empty copy of the element (same name and attributes),
        # attach it, then recurse over the original's children to fill it.
        copy = @root.create_element node.name, node.attributes
        top.add_child copy
        reparent_nodes copy, node
      end
      top
    end

    # Copies a text node, split on runs of newlines, and returns the new
    # insertion point. Inside a no-wrap container the chunks are appended as
    # plain text. Otherwise the first chunk reuses an already-open <p> and
    # every other chunk gets a <p> of its own.
    def copy_text_node(top, node)
      node.content.split(MULTIPLE_NEWLINES_REGEX).each_with_index do |text, index|
        if (index == 0 && top.name == "p") || NO_WRAP_TAG_LOOKUP.has_key?(top.name)
          top.add_child @root.create_text_node(text)
        else
          paragraph = @root.create_element "p", text
          if top.name == "p"
            # A following paragraph is a sibling of the current one.
            top.after paragraph
          else
            top.add_child paragraph
          end
          top = paragraph
        end
      end
      top
    end

    # Copies any other node (inline elements such as <em> or <a>) and returns
    # the new insertion point.
    #
    # Inside a <p>, or inside a container that must not receive <p> children,
    # the node is cloned in place. Anywhere else it is wrapped in a fresh <p>,
    # which becomes the insertion point so that following text joins it:
    # "<em>Foo</em> Bar Baz" ends up in a single paragraph.
    def copy_inline_node(top, node)
      if top.name == "p" || NO_WRAP_TAG_LOOKUP.has_key?(top.name)
        top.add_child node.clone
        top
      else
        paragraph = @root.create_element "p"
        paragraph.add_child node.clone
        top.add_child paragraph
        paragraph
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+ require 'wraptext'
4
+
5
+ RSpec.configure do |config|
6
+ end
@@ -0,0 +1,116 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wraptext::Parser do
4
+ it "should accept a String document" do
5
+ Wraptext::Parser.new("This is a string document").should be_a(Wraptext::Parser)
6
+ end
7
+
8
+ it "should accept a Nokogiri document" do
9
+ doc = Nokogiri::HTML "<body>foo</body>"
10
+ Wraptext::Parser.new(doc).should be_a(Wraptext::Parser)
11
+ end
12
+
13
+ context "given a document" do
14
+ before :each do
15
+ doc = <<-EOF
16
+ This is some text.
17
+
18
+ This is some more text.
19
+ EOF
20
+ @doc = Wraptext::Parser.new(doc)
21
+ end
22
+
23
+ it "should return a string from #to_html" do
24
+ @doc.to_html.should be_a(String)
25
+ end
26
+
27
+ it "should return a Nokogiri::XML::Element from #to_doc" do
28
+ @doc.to_doc.should be_a(Nokogiri::XML::Element)
29
+ end
30
+ end
31
+
32
+ context "given a set of plain text" do
33
+ before :each do
34
+ doc = <<-EOF
35
+ This is some text.
36
+
37
+ This is some more text.
38
+ EOF
39
+ @doc = Wraptext::Parser.new(doc)
40
+ end
41
+
42
+ it "should convert plain text to p-wrapped text" do
43
+ expects = <<-EOF
44
+ <p>This is some text.</p>
45
+ <p> This is some more text.
46
+ </p>
47
+ EOF
48
+ @doc.to_html.should == expects.strip
49
+ end
50
+ end
51
+
52
+ context "given plain text with a block element in the middle" do
53
+ it "should respect block-level elements" do
54
+ doc = <<-EOF
55
+ This is some text
56
+ <div>This is a block level element</div>
57
+ This is some text after the block element
58
+ EOF
59
+ expects = <<-EOF
60
+ <p>This is some text
61
+ </p>
62
+ <div><p>This is a block level element</p></div>
63
+ <p>
64
+ This is some text after the block element
65
+ </p>
66
+ EOF
67
+ Wraptext::Parser.new(doc).to_html.should == expects.strip
68
+ end
69
+ end
70
+
71
+
72
+ context "given plain text with some p-peer tags" do
73
+ it "should not inject p tags directly inside p-peer tags" do
74
+ doc = <<-EOF
75
+ This is some text
76
+ <h1>This is a p-peer element</h1>
77
+ This is some text after the block element
78
+ EOF
79
+ expects = <<-EOF
80
+ <p>This is some text
81
+ </p>
82
+ <h1>This is a p-peer element</h1>
83
+ <p>
84
+ This is some text after the block element
85
+ </p>
86
+ EOF
87
+ Wraptext::Parser.new(doc).to_html.should == expects.strip
88
+ end
89
+ end
90
+
91
+ context "given a <script> tag" do
92
+ it "should not perform any transformation inside the tag" do
93
+ doc = <<-EOF
94
+ This is some precursor text
95
+
96
+ And another line
97
+ <script>
98
+ var elem = 'this is some javascript';
99
+
100
+ elem = elem.toUpperCase();
101
+ </script>
102
+ EOF
103
+ expects = <<-EOF
104
+ <p>This is some precursor text</p>
105
+ <p>And another line
106
+ </p>
107
+ <script>
108
+ var elem = 'this is some javascript';
109
+
110
+ elem = elem.toUpperCase();
111
+ </script>
112
+ EOF
113
+ Wraptext::Parser.new(doc).to_html.should == expects.strip
114
+ end
115
+ end
116
+ end
data/wraptext.gemspec ADDED
@@ -0,0 +1,17 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for wraptext. File lists are taken from `git ls-files`,
# so this must be evaluated inside the project's git checkout.
Gem::Specification.new do |spec|
  # Identity
  spec.name          = "wraptext"
  spec.version       = "0.1"
  spec.authors       = ["Chris Heald"]
  spec.email         = ["cheald@gmail.com"]
  spec.homepage      = ""

  # Descriptions
  spec.summary       = %q{Wraps bare text nodes from an HTML document in <p> tags and splits text nodes on double newlines.}
  spec.description   = %q{Wraps bare text nodes from an HTML document in <p> tags and splits text nodes on double newlines. Conveniently serves to format Wordpress post content properly as a side effect.}

  # Packaged files
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Runtime dependencies
  spec.add_dependency('nokogiri')
end
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wraptext
3
+ version: !ruby/object:Gem::Version
4
+ hash: 9
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ version: "0.1"
10
+ platform: ruby
11
+ authors:
12
+ - Chris Heald
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2012-01-29 00:00:00 Z
18
+ dependencies:
19
+ - !ruby/object:Gem::Dependency
20
+ name: nokogiri
21
+ prerelease: false
22
+ requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ hash: 3
28
+ segments:
29
+ - 0
30
+ version: "0"
31
+ type: :runtime
32
+ version_requirements: *id001
33
+ description: Wraps bare text nodes from an HTML document in <p> tags and splits text nodes on double newlines. Conveniently serves to format Wordpress post content properly as a side effect.
34
+ email:
35
+ - cheald@gmail.com
36
+ executables: []
37
+
38
+ extensions: []
39
+
40
+ extra_rdoc_files: []
41
+
42
+ files:
43
+ - .gitignore
44
+ - .rspec
45
+ - Gemfile
46
+ - README.md
47
+ - Rakefile
48
+ - lib/wraptext.rb
49
+ - lib/wraptext/parser.rb
50
+ - spec/spec_helper.rb
51
+ - spec/wraptext/parser_spec.rb
52
+ - wraptext.gemspec
53
+ homepage: ""
54
+ licenses: []
55
+
56
+ post_install_message:
57
+ rdoc_options: []
58
+
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ hash: 3
67
+ segments:
68
+ - 0
69
+ version: "0"
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ hash: 3
76
+ segments:
77
+ - 0
78
+ version: "0"
79
+ requirements: []
80
+
81
+ rubyforge_project:
82
+ rubygems_version: 1.8.12
83
+ signing_key:
84
+ specification_version: 3
85
+ summary: Wraps bare text nodes from an HTML document in <p> tags and splits text nodes on double newlines.
86
+ test_files:
87
+ - spec/spec_helper.rb
88
+ - spec/wraptext/parser_spec.rb
89
+ has_rdoc: