zevarito-undress 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +1 -1
- data/lib/undress.rb +47 -0
- data/lib/undress/textile.rb +0 -7
- data/test/test_greencloth.rb +83 -3
- data/undress.gemspec +2 -2
- metadata +4 -3
data/README.rdoc
CHANGED
data/lib/undress.rb
CHANGED
@@ -27,6 +27,8 @@ module Undress
|
|
27
27
|
class Document #:nodoc:
|
28
28
|
def initialize(html, options)
|
29
29
|
@doc = Hpricot(html, options)
|
30
|
+
xhtmlize!
|
31
|
+
cleanup_indentation
|
30
32
|
end
|
31
33
|
|
32
34
|
def self.add_markup(name, grammar)
|
@@ -34,6 +36,51 @@ module Undress
|
|
34
36
|
grammar.process!(@doc)
|
35
37
|
end
|
36
38
|
end
|
39
|
+
|
40
|
+
private
|
41
|
+
|
42
|
+
# We try to fix those elements which aren't written as standard xhtml but, more
|
43
|
+
# important, which we can't parse correctly without fixing them first.
|
44
|
+
def xhtmlize!
|
45
|
+
(@doc/"ul|ol").each do |list|
|
46
|
+
fixup_list(list) if list.parent != "li" && list.parent.name !~ /ul|ol/
|
47
|
+
end
|
48
|
+
|
49
|
+
(@doc/"span[@style*='italic']").each { |e| e.swap "<em>#{e.inner_html}</em>" }
|
50
|
+
|
51
|
+
(@doc/"span[@style*='underline']").each { |e| e.swap "<ins>#{e.inner_html}</ins>" }
|
52
|
+
|
53
|
+
(@doc/"span[@style*='line-through']").each { |e| e.swap "<del>#{e.inner_html}</del>" }
|
54
|
+
|
55
|
+
(@doc/"span[@style*='bold']").each { |e| e.swap "<strong>#{e.inner_html}</strong>" }
|
56
|
+
end
|
57
|
+
|
58
|
+
# Delete tabs and newlines, and collapse runs of whitespace to a single space, inside elements
|
59
|
+
# except <pre> or <code> elements
|
60
|
+
def cleanup_indentation
|
61
|
+
(@doc/"*").each do |e|
|
62
|
+
if e.elem? && e.inner_html != "" && e.name !~ (/pre|code/) && e.children.size == 0
|
63
|
+
e.inner_html = e.inner_html.gsub(/\n|\t/,"").gsub(/\s+/," ")
|
64
|
+
elsif e.text? && e.parent.name !~ /pre|code/
|
65
|
+
e.content = e.content.gsub(/\n|\t/,"").gsub(/\s+/," ").gsub(/^\s$/, "")
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
# Fix up a badly nested list, such as a <ul> that is a sibling of an <li> instead of being inside the <li>.
|
71
|
+
def fixup_list(list)
|
72
|
+
list.children.each {|e| fixup_list(e) if e.elem? && e.name =~ /ol|ul/}
|
73
|
+
|
74
|
+
if list.parent.name != "li"
|
75
|
+
li_side = list.next_sibling if list.next_sibling && list.next_sibling.name == "li"
|
76
|
+
li_side = list.previous_sibling if list.previous_sibling && list.previous_sibling.name == "li"
|
77
|
+
|
78
|
+
if li_side
|
79
|
+
li_side.inner_html = "#{li_side.inner_html}#{list.to_html}"
|
80
|
+
list.parent.replace_child(list, "")
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
37
84
|
end
|
38
85
|
|
39
86
|
module ::Hpricot #:nodoc:
|
data/lib/undress/textile.rb
CHANGED
@@ -3,13 +3,6 @@ require File.expand_path(File.dirname(__FILE__) + "/../undress")
|
|
3
3
|
module Undress
|
4
4
|
class Textile < Grammar
|
5
5
|
|
6
|
-
# delete tabs and newlines from inside elements
|
7
|
-
pre_processing("*") do |e|
|
8
|
-
if e.elem? && e.parent.doc? && e.inner_html != "" && e.name != "pre"
|
9
|
-
e.inner_html = e.inner_html.gsub(/\n|\t/,"")
|
10
|
-
end
|
11
|
-
end
|
12
|
-
|
13
6
|
# whitespace handling
|
14
7
|
post_processing(/\n\n+/, "\n\n")
|
15
8
|
post_processing(/\A\s+/, "")
|
data/test/test_greencloth.rb
CHANGED
@@ -2,7 +2,87 @@ require File.expand_path(File.dirname(__FILE__) + "/test_helper")
|
|
2
2
|
|
3
3
|
class Undress::GreenClothTest < Test::Unit::TestCase
|
4
4
|
def assert_renders_greencloth(greencloth, html)
|
5
|
-
assert_equal greencloth, Undress(html
|
5
|
+
assert_equal greencloth, Undress(html).to_greencloth
|
6
|
+
end
|
7
|
+
|
8
|
+
context "parsing badly indented documents" do
|
9
|
+
test "badly indent doc" do
|
10
|
+
html = "<ul>
|
11
|
+
<li>foo</li>
|
12
|
+
<li>bar</li>
|
13
|
+
<li>and x is also.</li>
|
14
|
+
</ul>"
|
15
|
+
greencloth = "* foo\n* bar\n* and x is also.\n"
|
16
|
+
assert_renders_greencloth greencloth, html
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
# TODO:
|
21
|
+
# this is ok to ensure invalid html -> to greencloth but xhtmlize! must have
|
22
|
+
# tests on test_undress or something too
|
23
|
+
context "parsing not valid xhtml documents" do
|
24
|
+
test "font-weight=bold styles in <span> elements should be <strong>" do
|
25
|
+
html = "<p>some text <span style='font-weight=bold'>bold</span> with style</p>"
|
26
|
+
greencloth = "some text *bold* with style\n"
|
27
|
+
assert_renders_greencloth greencloth, html
|
28
|
+
end
|
29
|
+
|
30
|
+
test "style 'line-through' should be converted to <del> in <span> elements" do
|
31
|
+
html = "<p>with <span style='text-decoration: line-through;'>some</span> in the <span style='text-decoration-: line-through;'>paragraph</span></p>"
|
32
|
+
greencloth = "with -some- in the -paragraph-\n"
|
33
|
+
assert_renders_greencloth greencloth, html
|
34
|
+
html = "<p style='text-decoration: line-through;'>with some in the paragraph</p>"
|
35
|
+
greencloth = "with some in the paragraph\n"
|
36
|
+
assert_renders_greencloth greencloth, html
|
37
|
+
end
|
38
|
+
|
39
|
+
test "style 'underline' should be converted to <ins> in <span> elements" do
|
40
|
+
html = "<p>with <span style='text-decoration: underline;'>some</span> in the <span style='text-decoration: underline;'>paragraph</span></p>"
|
41
|
+
greencloth = "with +some+ in the +paragraph+\n"
|
42
|
+
assert_renders_greencloth greencloth, html
|
43
|
+
html = "<p style='text-decoration: underline;'>with some in the paragraph</p>"
|
44
|
+
greencloth = "with some in the paragraph\n"
|
45
|
+
assert_renders_greencloth greencloth, html
|
46
|
+
end
|
47
|
+
|
48
|
+
test "style 'italic' should be converted to <em> in <span> elements" do
|
49
|
+
html = "<p>with <span style='font-style: italic;'>some</span> in the <span style='font-style: italic;'>paragraph</span></p>"
|
50
|
+
greencloth = "with _some_ in the _paragraph_\n"
|
51
|
+
assert_renders_greencloth greencloth, html
|
52
|
+
html = "<p style='font-style: italic;'>with some in the paragraph</p>"
|
53
|
+
greencloth = "with some in the paragraph\n"
|
54
|
+
assert_renders_greencloth greencloth, html
|
55
|
+
end
|
56
|
+
|
57
|
+
test "a nested invalid unordered list" do
|
58
|
+
html = "<ul><li>item 1</li><li>item 2</li><ul><li>nested 1</li><li>nested 2</li></ul><li>item 3</li></ul>"
|
59
|
+
greencloth = "* item 1\n* item 2\n** nested 1\n** nested 2\n* item 3\n"
|
60
|
+
assert_renders_greencloth greencloth, html
|
61
|
+
end
|
62
|
+
|
63
|
+
test "a nested invalid ordered list" do
|
64
|
+
html = "<ol><li>item 1</li><li>item 2</li><ol><li>nested 1</li><li>nested 2</li></ol><li>item 3</li></ol>"
|
65
|
+
greencloth = "# item 1\n# item 2\n## nested 1\n## nested 2\n# item 3\n"
|
66
|
+
assert_renders_greencloth greencloth, html
|
67
|
+
end
|
68
|
+
|
69
|
+
test "a nested invalid mixed list with 3 levels" do
|
70
|
+
html = "<ul><li>item 1</li><li>item 2</li><ol><li>nested 1</li><li>nested 2</li><ul><li>nested2 1</li><li>nested2 2</li></ul></ol><li>item 3</li></ul>"
|
71
|
+
greencloth = "* item 1\n* item 2\n*# nested 1\n*# nested 2\n*#* nested2 1\n*#* nested2 2\n* item 3\n"
|
72
|
+
assert_renders_greencloth greencloth, html
|
73
|
+
end
|
74
|
+
|
75
|
+
test "a nested invalid mixed list" do
|
76
|
+
html = "<ul><li>item 1</li><li>item 2</li><ol><li>nested 1</li><li>nested 2</li></ol><li>item 3</li></ul>"
|
77
|
+
greencloth = "* item 1\n* item 2\n*# nested 1\n*# nested 2\n* item 3\n"
|
78
|
+
assert_renders_greencloth greencloth, html
|
79
|
+
end
|
80
|
+
|
81
|
+
test "2 badly nested list inside" do
|
82
|
+
html = "<ul><li>item 1</li><li>item 2</li><ul><li>nested 1</li><ul><li>item 1x</li><li>item 2x</li></ul><li>nested 2</li></ul><li>item 3</li></ul>"
|
83
|
+
greencloth = "* item 1\n* item 2\n** nested 1\n*** item 1x\n*** item 2x\n** nested 2\n* item 3\n"
|
84
|
+
assert_renders_greencloth greencloth, html
|
85
|
+
end
|
6
86
|
end
|
7
87
|
|
8
88
|
# unallowed tags
|
@@ -52,13 +132,13 @@ class Undress::GreenClothTest < Test::Unit::TestCase
|
|
52
132
|
context "embed and object" do
|
53
133
|
test "embed" do
|
54
134
|
html = "<p>do you like my embedded blip.tv <embed src='http://blip.tv/play/Ac3GfI+2HA' allowfullscreen='true' type='application/x-shockwave-flash' allowscriptaccess='always' height='510' width='720' />?</p>"
|
55
|
-
greencloth = "do you like my embedded blip.tv <embed src=\"http://blip.tv/play/Ac3GfI+2HA\"
|
135
|
+
greencloth = "do you like my embedded blip.tv <embed allowfullscreen=\"true\" src=\"http://blip.tv/play/Ac3GfI+2HA\" allowscriptaccess=\"always\" type=\"application/x-shockwave-flash\" height=\"510\" width=\"720\" />?\n"
|
56
136
|
assert_renders_greencloth greencloth, html
|
57
137
|
end
|
58
138
|
|
59
139
|
test "object" do
|
60
140
|
html = "<p>do you like my embedded youtube <object width='425' height='344'><param name='movie' value='http://www.youtube.com/v/suvDQoXA-TA&hl=en&fs=1' /><param name='allowFullScreen' value='true' /><embed src='http://www.youtube.com/v/suvDQoXA-TA&hl=en&fs=1' type='application/x-shockwave-flash' width='425' height='344' allowfullscreen='true' /></object>?</p>"
|
61
|
-
greencloth = "do you like my embedded youtube <object height=\"344\" width=\"425\"><param name=\"movie\" value=\"http://www.youtube.com/v/suvDQoXA-TA&hl=en&fs=1\" /><param name=\"allowFullScreen\" value=\"true\" /><embed src=\"http://www.youtube.com/v/suvDQoXA-TA&hl=en&fs=1\"
|
141
|
+
greencloth = "do you like my embedded youtube <object height=\"344\" width=\"425\"><param name=\"movie\" value=\"http://www.youtube.com/v/suvDQoXA-TA&hl=en&fs=1\" /><param name=\"allowFullScreen\" value=\"true\" /><embed allowfullscreen=\"true\" src=\"http://www.youtube.com/v/suvDQoXA-TA&hl=en&fs=1\" type=\"application/x-shockwave-flash\" height=\"344\" width=\"425\" /></object>?\n"
|
62
142
|
assert_renders_greencloth greencloth, html
|
63
143
|
end
|
64
144
|
end
|
data/undress.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "undress"
|
3
|
-
s.version = "0.1"
|
4
|
-
s.date = "2009-07-
|
3
|
+
s.version = "0.2"
|
4
|
+
s.date = "2009-07-29"
|
5
5
|
|
6
6
|
s.description = "Simply translate HTML to Textile, Markdown, or whatever other markup format you need"
|
7
7
|
s.summary = "Convert HTML into other markup languages"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zevarito-undress
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: "0.1"
|
4
|
+
version: "0.2"
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- "Nicol\xC3\xA1s Sanguinetti"
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-07-
|
12
|
+
date: 2009-07-29 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -77,6 +77,7 @@ files:
|
|
77
77
|
- test/test_greencloth.rb
|
78
78
|
has_rdoc: true
|
79
79
|
homepage: http://undress.rubyforge.org
|
80
|
+
licenses:
|
80
81
|
post_install_message:
|
81
82
|
rdoc_options: []
|
82
83
|
|
@@ -97,7 +98,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
97
98
|
requirements: []
|
98
99
|
|
99
100
|
rubyforge_project: undress
|
100
|
-
rubygems_version: 1.
|
101
|
+
rubygems_version: 1.3.5
|
101
102
|
signing_key:
|
102
103
|
specification_version: 2
|
103
104
|
summary: Convert HTML into other markup languages
|