breakout_parser 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +16 -0
- data/LICENSE +39 -0
- data/Rakefile +99 -0
- data/VERSION +1 -0
- data/ext/breakout_parser/_make.sh +7 -0
- data/ext/breakout_parser/extconf.rb +16 -0
- data/ext/breakout_parser/lex.yy.c +2449 -0
- data/ext/breakout_parser/lex.yy.o +0 -0
- data/ext/breakout_parser/main.c +32 -0
- data/ext/breakout_parser/main.o +0 -0
- data/ext/breakout_parser/make_win32.bat +15 -0
- data/ext/breakout_parser/parser +0 -0
- data/ext/breakout_parser/parser.l +162 -0
- data/ext/breakout_parser/parser.tab.h +98 -0
- data/ext/breakout_parser/parser.tab.o +0 -0
- data/ext/breakout_parser/parser.y +357 -0
- data/ext/breakout_parser/ruby_ext.c +55 -0
- data/ext/breakout_parser/ruby_ext.o +0 -0
- data/ext/breakout_parser/test.rb +3 -0
- data/ext/breakout_parser/yywrap.c +3 -0
- data/ext/breakout_parser/yywrap.o +0 -0
- data/lib/breakout_parser.rb +6 -0
- data/spec/parser_examples_spec.rb +101 -0
- data/spec/parser_spec.rb +549 -0
- metadata +87 -0
@@ -0,0 +1,55 @@
|
|
1
|
+
#ifdef RUBY_VERSION
|
2
|
+
|
3
|
+
#include "ruby.h"
|
4
|
+
|
5
|
+
void Init_breakout_parser();
|
6
|
+
VALUE method_parse(VALUE, VALUE);
|
7
|
+
|
8
|
+
VALUE breakout_parser = Qnil;
|
9
|
+
|
10
|
+
void Init_breakout_parser() {
|
11
|
+
breakout_parser = rb_define_class("BreakoutParser",rb_cObject);
|
12
|
+
rb_define_singleton_method(breakout_parser, "parse", method_parse, 1);
|
13
|
+
}
|
14
|
+
|
15
|
+
extern char *buf, *bufptr;
|
16
|
+
extern char *in_buf, *in_pos;
|
17
|
+
extern size_t in_buf_len, bufsize;
|
18
|
+
|
19
|
+
/*
 * BreakoutParser.parse(text) -> String
 *
 * Feeds `text` to the generated parser through the shared globals
 * in_buf / in_pos / in_buf_len, lets the parser write into the shared output
 * buffer buf / bufptr / bufsize, and returns the bytes between buf and bufptr
 * as a new Ruby String.
 *
 * self - receiver (unused).
 * text - input; converted with StringValue, so a non-String raises TypeError.
 */
VALUE method_parse(VALUE self, VALUE text) {
    /* The parser entry points are defined in other translation units and no
     * header declaring them is included here, so declare them explicitly
     * rather than relying on implicit function declarations. */
    extern int yyparse(void);
    extern int yylex_destroy(void);

    VALUE s;
    char *p;

    /* RSTRING(text)->ptr / ->len only exist on Ruby 1.8; the accessor macros
     * work on every version that provides them. */
    StringValue(text);
    p = RSTRING_PTR(text);
    in_buf_len = RSTRING_LEN(text);

    /* Skip leading whitespace before handing the input to the lexer. */
    while (in_buf_len > 0 && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')) {
        p++;
        in_buf_len--;
    }
    in_buf = p;
    in_pos = in_buf;

    bufsize = 1 + in_buf_len + in_buf_len/3; // reserve 30% of in_buf size
    if (bufsize < 0x100) bufsize = 0x100;

    /* ALLOC_N memory is never reclaimed by the GC, so the output buffer needs
     * no wrapper object to stay alive; it is released with xfree() below. */
    buf = ALLOC_N(char, bufsize);
    bufptr = buf;

    yyparse();
    yylex_destroy();

    /* make ruby string from our char[] data */
    s = rb_str_new(buf, bufptr - buf);

    /* cleanup */
    xfree(buf);
    buf = bufptr = NULL;
    bufsize = 0;

    /* in_buf pointed into text's storage during the parse; keep the String
     * reachable until this point. */
    RB_GC_GUARD(text);

    return s;
}
|
54
|
+
|
55
|
+
#endif // ifdef RUBY_VERSION
|
Binary file
|
Binary file
|
@@ -0,0 +1,101 @@
|
|
1
|
+
describe 'BreakoutParser' do
|
2
|
+
|
3
|
+
# One example per examples/orig/*.bad file: the parser only has to survive the
# input and emit at least as many characters as the whitespace-collapsed source.
describe "bad examples" do
  Dir["examples/orig/*.bad"].sort.each do |fname|
    it "should not die on #{fname} " do
      # Read the source once and reuse it for the size comparison.
      collapsed = File.read(fname).strip.gsub(/\s+/, ' ')
      parse_file(fname).size.should >= collapsed.size
    end
  end
end
|
11
|
+
# Registers one body-less (pending) example per examples/orig/*.pending file.
describe "pending examples" do
  pending_inputs = Dir["examples/orig/*.pending"].sort
  pending_inputs.each { |path| it "should parse #{path} " }
end
|
16
|
+
|
17
|
+
# For every examples/orig/*.txt file, compares the output of ./parser with the
# reference HTML stored under examples/parsed/ after both sides have been
# normalised (case, whitespace, <br /> runs) so only real differences remain.
describe "preparsed examples" do
  Dir["examples/orig/*.txt"].sort.each do |fname|
    bname = File.basename(fname)
    it "should parse #{fname} " do
      preparsed = File.read("examples/parsed/#{bname}")

      # Drop a single wrapping <p> ... </p> pair.
      preparsed = preparsed[3..-1] if preparsed[0..2] == '<p>'
      preparsed = preparsed[0..-5] if preparsed[-4..-1] == '</p>'

      # Typographic characters and <br /> spacing; applied in this exact order.
      [
        ["–", "-"],
        ["—", "--"],
        ["‘", "'"],
        ["’", "'"],
        ["…", "..."],
        ["×", "x"],
        ["©", "(c)"],
        ["<br />\n", "<br />"],
        [/[ \t]+<br \/>/, "<br />"],
        ["\t", " "]
      ].each { |from, to| preparsed.gsub!(from, to) }

      if preparsed.include?('<hr />')
        # find longest dash-line in source
        dashline = File.read(fname).scan(/-+/).max_by { |run| run.length }
        preparsed.gsub!("</p>\n<hr />\n<p>", "<br /><br />#{dashline}<br /><br />")
      end

      # Paragraph tags become blank lines, remaining whitespace is squeezed,
      # <del> tags become dashes and <br /> runs are capped at two.
      [
        ["</pre>\n<ol>", "</pre><br /><br /><ol>"],
        [/<\/p>\s+<p>/, "<br /><br />"],
        ["</p>\n", "<br /><br />"],
        ["<p>", "<br /><br />"],
        [/[\r\n]+ */, " "],
        [/[ \t]{2,}/, " "],
        ["<del>", "-"],
        ["</del>", "-"],
        [/<br \/>[ ]+/, "<br />"],
        [/(<br \/>){2,}/, "<br /><br />"]
      ].each { |from, to| preparsed.gsub!(from, to) }

      parsed = parse_file(fname)

      # old parser not parses raw text urls
      #parsed.gsub!(%r'<a href="([^<>"]+)">([^<>"]+)</a>',"\\1")

      t1 = parsed
      t2 = preparsed

      # Normalise both sides in place with the same steps.
      [t1, t2].each do |t|
        t.downcase!
        t.gsub!(/(\s*<br \/>\s*)+/, ' ')
        t.gsub!(/\n\s*/, "\n")
        t.gsub!(/>[ \t]+/, ">")
        t.gsub!(/[ \t]+</, "<")
        t.gsub!(/[\r\n \t]+/, " ")
        t.strip!
      end

      if t1 != t2
        # File.open("last-parsed.tmp","w"){ |f| f << parsed }
        # File.open("last-preparsed.tmp","w"){ |f| f << preparsed }

        # Locate the first differing index with a single linear scan (the
        # strings differ, so the scan always stops), then back up five
        # characters so the failure message shows a little context.
        pos = 0
        pos += 1 while t1[pos] == t2[pos]
        pos -= 5
        pos = 0 if pos < 0
        t1[pos..-1].should == t2[pos..-1]
      end
      t1.should == t2
    end
    # Running count of registered examples (only used by the disabled break).
    $n ||= 0
    $n += 1
    # break if $n == 1900
  end
end
|
92
|
+
|
93
|
+
###############################################################################
|
94
|
+
###############################################################################
|
95
|
+
###############################################################################
|
96
|
+
|
97
|
+
# Pipes the file +fname+ through the standalone ./parser executable and returns
# its standard output with surrounding whitespace stripped.
#
# The name is shell-escaped before being placed on the command line, so spaces
# or shell metacharacters in it cannot alter the command that is executed.
def parse_file fname
  require 'shellwords'
  `cat #{Shellwords.escape(fname)} | ./parser`.strip
end
|
101
|
+
end
|
data/spec/parser_spec.rb
ADDED
@@ -0,0 +1,549 @@
|
|
1
|
+
require 'breakout_parser'
|
2
|
+
|
3
|
+
describe 'BreakoutParser' do
|
4
|
+
# Returns the bytes of +s+ as one lowercase hex string, two digits per byte.
def self.hex_string s
  s.unpack('H*').first
end

# Instance-level shortcut that forwards to the class-level helper.
def hex_string s
  self.class.hex_string(s)
end
|
8
|
+
|
9
|
+
# A newline between two words comes out as <br />, optionally padded by a space.
it 'converts \n to <br />' do
  html = parse("aaa\nbbb")
  html.should match(%r"aaa ?<br /> ?bbb")
end

# One megabyte of a single repeated letter: output length equals input length.
it "parses 1M file #1" do
  letters = 'a' * (1024 * 1024)
  parse(letters).size.should == letters.size
end

# One megabyte of spaces between two letters.
it "parses 1M file #2" do
  gap = ' ' * 1024 * 1024
  parse("a#{gap}b").should == 'a b'
end

# One megabyte of "a " pairs: output length equals the stripped input length.
it "parses 1M file #3" do
  pairs = 'a ' * (1024 * 512)
  parse(pairs).size.should == pairs.strip.size
end
|
27
|
+
|
28
|
+
# Whitespace appended after the text must not appear in the output.
it "strips tailing spaces and newlines" do
  blanks = [
    " ",
    "\t\t\t\t\t\t",
    "\r\r\r\r\r",
    "\n\n\n\n\n",
    "\r\n\r\n\r\n\r\n",
    "\r\n\t \t \n \r \n \t \t\n\r "
  ]
  blanks.each { |tail| parse("aaa#{tail}").should == "aaa" }
end

# Whitespace placed before the text must not appear in the output.
it "strips leading spaces and newlines" do
  blanks = [
    " ",
    "\t\t\t\t\t\t",
    "\r\r\r\r\r",
    "\n\n\n\n\n",
    "\r\n\r\n\r\n\r\n",
    "\r\n\t \t \n \r \n \t \t\n\r "
  ]
  blanks.each { |head| parse("#{head}aaa").should == "aaa" }
end
|
45
|
+
|
46
|
+
# Any run of two or more line breaks (with or without spaces in between, LF or
# CRLF) is expected to yield exactly two <br /> tags.
it "converts two or more \\n to single empty line" do
  separators = [
    "\n\n",
    "\n \n",
    "\n\n\n",
    "\n \n \n",
    "\r\n \r\n \r\n",
    "\n \n\n \n",
    "\n \n\n\n \n",
    "\n\n\n\n\n\n\n",
    "\r\n\r\n\r\n"
  ]
  separators.each do |sep|
    parse("aaa#{sep}bbb").should == "aaa<br /><br />bbb"
  end
end
|
57
|
+
|
58
|
+
###############################################################################
|
59
|
+
|
60
|
+
# Bold markup: text wrapped in stars is rendered inside <strong> tags.
describe "*bold*" do
  # Each row: [example name, markup, expected html].
  [
    ["only", "*bold*", '<strong>bold</strong>'],
    ["at beginning", "*bold*\nxxx", '<strong>bold</strong><br />xxx'],
    ["in the middle of text", "xxx *bold* yyy", 'xxx <strong>bold</strong> yyy'],
    ["parses *multiline\\nbold*", "*multiline\nbold*", "<strong>multiline<br />bold</strong>"],
    ["skips lone star inside bold block", "*aaa * bbb*", '<strong>aaa * bbb</strong>'],
    ["skips lone star", "aaa * bbb", 'aaa * bbb'],
    ["w/o closing tag", "*bold", '<strong>bold</strong>'],
    ["nesting1 w/o closing tags", "*bold1 *bold2", '<strong>bold1 <strong>bold2</strong></strong>'],
    ["nesting2 w/o closing tags", "*bold1 *bold2", '<strong>bold1 <strong>bold2</strong></strong>']
  ].each do |title, markup, html|
    it title do
      parse(markup).should == html
    end
  end

  # A star directly followed by a dot is left as plain text.
  it "not parses '*.*'" do
    [["*.*", "*.*"], [" *.* ", "*.*"], ["aaa *.* bbb", "aaa *.* bbb"]].each do |markup, html|
      parse(markup).should == html
    end
  end

  it "not parses '*.something'" do
    [["*.exe", "*.exe"], [" *.exe ", "*.exe"], ["aaa *.exe bbb", "aaa *.exe bbb"]].each do |markup, html|
      parse(markup).should == html
    end
  end
end
|
102
|
+
|
103
|
+
###############################################################################
|
104
|
+
|
105
|
+
# Italic markup: text wrapped in underscores is rendered inside <em> tags.
describe "_italic_" do
  # Each row: [example name, markup, expected html].
  [
    ["only", "_italic_", '<em>italic</em>'],
    ["at beginning", "_italic_\nxxx", '<em>italic</em><br />xxx'],
    ["in the middle of text", "xxx _italic_ yyy", 'xxx <em>italic</em> yyy'],
    ["parses _multiline\\nitalic_", "_multiline\nitalic_", "<em>multiline<br />italic</em>"],
    ["skips lone underscore inside italic block", "_aaa _ bbb_", '<em>aaa _ bbb</em>'],
    ["skips lone underscore", "aaa _ bbb", 'aaa _ bbb'],
    ["w/o closing tag", "_italic", '<em>italic</em>'],
    ["nesting1 w/o closing tags", "_italic1 _italic2", '<em>italic1 <em>italic2</em></em>'],
    ["nesting2 w/o closing tags", "_italic1 _italic2", '<em>italic1 <em>italic2</em></em>']
  ].each do |title, markup, html|
    it title do
      parse(markup).should == html
    end
  end
end
|
134
|
+
|
135
|
+
###############################################################################
|
136
|
+
|
137
|
+
# Mixed markup: bold/italic nested in each other, and raw links in list items.
describe "combinations" do
  # Each row: [example name, markup, expected html].
  [
    ["bold in italic", "_aaa *bbb* ccc_", "<em>aaa <strong>bbb</strong> ccc</em>"],
    ["bold in italic - no closing1", "_aaa *bbb* ccc", "<em>aaa <strong>bbb</strong> ccc</em>"],
    ["bold in italic - no closing2", "_aaa *bbb ccc", "<em>aaa <strong>bbb ccc</strong></em>"],
    ["italic in bold", "*aaa _bbb_ ccc*", "<strong>aaa <em>bbb</em> ccc</strong>"],
    ["italic in bold - no closing1", "*aaa _bbb_ ccc", "<strong>aaa <em>bbb</em> ccc</strong>"],
    ["italic in bold - no closing2", "*aaa _bbb ccc", "<strong>aaa <em>bbb ccc</em></strong>"]
  ].each do |title, markup, html|
    it title do
      parse(markup).should == html
    end
  end

  # The same link examples are generated for both list kinds; the hash maps the
  # expected list tag to the bullet character that opens an item.
  {'ul' => '*', 'ol' => '#'}.each do |tag, bullet|
    ru = "<a href=\"http://www.ru\">http://www.ru</a>"
    ya = "<a href=\"http://ya.ru\">http://ya.ru</a>"

    it "raw text link inside #{tag.upcase}> #1" do
      markup = "#{bullet} aaa http://www.ru"
      parse(markup).should == "<#{tag}><li>aaa #{ru}</li></#{tag}>"
    end
    it "raw text link inside #{tag.upcase}> #2" do
      markup = "#{bullet} aaa http://www.ru\n#{bullet} bbb"
      parse(markup).should == "<#{tag}><li>aaa #{ru}</li><li>bbb</li></#{tag}>"
    end
    it "raw text link inside #{tag.upcase}> #3" do
      markup = "#{bullet} http://www.ru"
      parse(markup).should == "<#{tag}><li>#{ru}</li></#{tag}>"
    end
    it "raw text link inside #{tag.upcase}> #4" do
      markup = "#{bullet} aaa http://www.ru bbb"
      parse(markup).should == "<#{tag}><li>aaa #{ru} bbb</li></#{tag}>"
    end
    it "two links inside #{tag.upcase}>" do
      markup = "#{bullet} aaa http://www.ru http://ya.ru bbb"
      parse(markup).should == "<#{tag}><li>aaa #{ru} #{ya} bbb</li></#{tag}>"
    end
  end
end
|
187
|
+
|
188
|
+
###############################################################################
|
189
|
+
|
190
|
+
# Lines starting with "* " form a <ul> with one <li> per line.
describe "unnumbered list" do
  it "should work" do
    html = parse("* a\n* b\n* c")
    html.should match(%r"<ul><li>a</li><li>b</li><li>c</li></ul>")
  end

  it "two lists" do
    items = "* a\n* b\n* c"
    markup = items + "\nxxx\n" + items
    list = "<ul><li>a</li><li>b</li><li>c</li></ul>"
    parse(markup).should == "#{list}xxx<br />#{list}"
  end

  it "in middle of text when begins with space" do
    expected = "hello<br /><ul><li>a</li><li>b</li><li>c</li></ul>world"
    parse("hello\n * a\n * b\n * c\nworld").should == expected
  end

  it "in middle of text" do
    expected = "hello<br /><ul><li>a</li><li>b</li><li>c</li></ul>world"
    parse("hello\n* a\n* b\n* c\nworld").should == expected
  end

  it "after blank line" do
    expected = "hello<br /><br /><ul><li>a</li><li>b</li><li>c</li></ul>world"
    parse("hello\n\n * a\n * b\n * c\nworld").should == expected
  end
end
|
215
|
+
|
216
|
+
###############################################################################
|
217
|
+
|
218
|
+
# Lines starting with "# " form an <ol> with one <li> per line.
describe "numbered list" do
  it "should work" do
    html = parse("# a\n# b\n# c")
    html.should match(%r"<ol><li>a</li><li>b</li><li>c</li></ol>")
  end

  it "two lists" do
    items = "# a\n# b\n# c"
    markup = items + "\nxxx\n" + items
    list = "<ol><li>a</li><li>b</li><li>c</li></ol>"
    parse(markup).should == "#{list}xxx<br />#{list}"
  end

  it "in middle of text when begins with space" do
    expected = "hello<br /><ol><li>a</li><li>b</li><li>c</li></ol>world"
    parse("hello\n # a\n # b\n # c\nworld").should == expected
  end

  it "in middle of text" do
    expected = "hello<br /><ol><li>a</li><li>b</li><li>c</li></ol>world"
    parse("hello\n# a\n# b\n# c\nworld").should == expected
  end

  it "after blank line" do
    expected = "hello<br /><br /><ol><li>a</li><li>b</li><li>c</li></ol>world"
    parse("hello\n\n # a\n # b\n # c\nworld").should == expected
  end
end
|
243
|
+
|
244
|
+
###############################################################################
|
245
|
+
|
246
|
+
# Generates the same group of heading examples for every level h1..h5.
1.upto(5) do |lvl|
  describe "H#{lvl}" do
    # Builds the expected heading element for this level.
    heading = lambda { |id, text| "<h#{lvl} id=\"#{id}\">#{text}</h#{lvl}>" }

    it "at the beginning" do
      parse("h#{lvl}. xxx").should == heading.call("xxx", "xxx")
    end
    it "after 1 line of text" do
      parse("abcd\nh#{lvl}. xxx").should == "abcd<br />#{heading.call("xxx", "xxx")}"
    end
    it "after 2 lines of text" do
      parse("abcd\ndefgh\nh#{lvl}. xxx").should == "abcd<br />defgh<br />#{heading.call("xxx", "xxx")}"
    end
    it "in middle of other words" do
      plain = "abcd defgh h#{lvl}. xxx yyy"
      parse(plain).should == plain
    end
    it "in middle of other lines" do
      parse("abcd defgh\nh#{lvl}. xxx\nyyy").should == "abcd defgh<br />#{heading.call("xxx", "xxx")}<br />yyy"
    end

    # NOTE(review): the expected id contains a run of three underscores while
    # the input shown here has single spaces — the input presumably had a run
    # of three spaces originally; confirm against the upstream spec.
    it "converts spaces to underscores in id" do
      parse("h#{lvl}. xxx yyy z").should == heading.call("xxx___yyy_z", "xxx yyy z")
    end

    # Titles made of these characters are used unchanged as the id.
    [
      ["keeps underscores in id", "xxx___yyy_z"],
      ["keeps dashes in id", "xxx---yyy-z"],
      ["keeps dots in id", "xxx...yyy.z"]
    ].each do |title, text|
      it title do
        parse("h#{lvl}. #{text}").should == heading.call(text, text)
      end
    end

    # For each of these tokens the id is expected to be the hex encoding of the
    # whole title, and the visible text is passed through h().
    %w'Ъ ъ : ; , привет" \' ! < >'.each do |c|
      it "converts id to hex if it contains \"#{c}\"" do
        idhex = hex_string("xxx#{c}yyy")
        parse("h#{lvl}. xxx#{c}yyy").should == heading.call(idhex, "xxx#{h(c)}yyy")
      end
    end

    it "skips excess spaces" do
      parse("h#{lvl}. \t xxx \t ").should == heading.call("xxx", "xxx")
    end

    it "thinks that \\r is EOL" do
      xxx = heading.call("xxx", "xxx")
      parse("h#{lvl}. xxx\ryyy").should == "#{xxx}<br />yyy"
      parse("h#{lvl}. xxx\r").should == xxx

      parse("h#{lvl}. xxx\r yyy").sub(' yyy','yyy').should == "#{xxx}<br />yyy"
    end
  end
end
|
297
|
+
|
298
|
+
###############################################################################
|
299
|
+
|
300
|
+
# Bare http:// and https:// URLs in text are wrapped in <a> tags.
describe "raw text links" do
  # Builds the expected anchor whose href and text are both +url+.
  anchor = lambda { |url| "<a href=\"#{url}\">#{url}</a>" }

  it "at the beginning" do
    parse("http://www.ru").should == anchor.call("http://www.ru")
  end
  it "in middle of other words" do
    expected = "aaa bbb ccc #{anchor.call("http://www.ru")} ddd eee fff"
    parse("aaa bbb ccc http://www.ru ddd eee fff").should == expected
  end
  it "in new line" do
    html = parse("aaa bbb ccc\nhttp://www.ru\nddd eee fff")
    html.should match(
      %r"aaa bbb ccc ?<br /> ?<a href=\"http://www.ru\">http://www.ru</a> ?<br /> ?ddd eee fff"
    )
  end

  # NOTE(review): the example name says '&' is escaped in the link text, but
  # the expected string shown here contains a bare '&' — confirm the intended
  # expectation against the upstream spec.
  it "escapes '&' in link _text_" do
    parse("http://www.ru/?a=1&b=2").should == "<a href=\"http://www.ru/?a=1&b=2\">http://www.ru/?a=1&b=2</a>"
  end

  it "parses https://" do
    parse("https://www.ru").should == anchor.call("https://www.ru")
  end

  # A comma or dot right after the URL ends the link and stays outside of it.
  %w', .'.each do |c|
    it "stops parsing on \"#{c} \"" do
      ru = anchor.call("http://www.ru")
      parse("http://www.ru#{c}").should == "#{ru}#{c}"
      parse(" http://www.ru#{c} ").should == "#{ru}#{c}"
      parse(" http://www.ru#{c} hello!").should == "#{ru}#{c} hello!"
      parse("xxx http://www.ru#{c} hello!").should == "xxx #{ru}#{c} hello!"
      parse(" http://www.ru/#{c} hello!").should == "#{anchor.call("http://www.ru/")}#{c} hello!"
      parse(" http://aaa.com#{c} http://bbb.com").should ==
        "#{anchor.call("http://aaa.com")}#{c} #{anchor.call("http://bbb.com")}"
    end
  end
end
|
333
|
+
|
334
|
+
###############################################################################
|
335
|
+
|
336
|
+
# "#<digits>" is turned into a link to that ticket in the test_space space.
describe "#ticketNum ticket links" do
  # Builds the expected anchor for ticket number +num+.
  ticket = lambda { |num| %Q|<a href="/spaces/test_space/tickets/#{num}">##{num}</a>| }

  it "at the beginning" do
    parse("#1234").should == ticket.call(1234)
  end
  it "in middle of other words" do
    expected = "aaa bbb ccc #{ticket.call(3476)} ddd eee fff"
    parse("aaa bbb ccc #3476 ddd eee fff").should == expected
  end
  it "in new line" do
    html = parse("aaa bbb ccc\n#1234\nddd eee fff")
    html.should match(
      %r|aaa bbb ccc ?<br /> ?<a href="/spaces/test_space/tickets/1234">#1234</a> ?<br /> ?ddd eee fff|
    )
  end
  it "ignores non-digits" do
    ['#1234d', '#xxx'].each do |plain|
      parse(plain).should == plain
    end
  end
end
|
354
|
+
|
355
|
+
###############################################################################
|
356
|
+
|
357
|
+
# Content inside <pre><code> ... </code></pre> must come out verbatim: no
# inline markup, lists, headings or links are recognised there, and only HTML
# special characters are escaped.
describe "<pre><code>..</code></pre>" do
  it "works" do
    # The heredoc bodies are kept flush left so their content is unchanged.
    s = <<-EOF
for ( n = 0; n < max_size && \
(c = getc( yyin )) != EOF && c != '\\n'; ++n ) \
buf[n] = (char) c; \

    EOF

    parse("<pre><code>#{s.strip}</code></pre>").should ==
      "<pre><code>#{h(s.strip)}</code></pre>"

    s = <<-EOF
while ( 1 < 2 ) do
puts "<b>12345\\t54321</b>"
// *bold* comment
// _italic_ comment
end
---
* aaa
* bbb
* ccc

    EOF
    parse("<pre><code>#{s.strip}</code></pre>").should ==
      "<pre><code>#{h(s.strip)}</code></pre>"
  end
  it "not parses *bold*" do
    s = "<pre><code> *bold*</code></pre>"
    parse(s).should == s
  end
  it "not parses _italic_" do
    s = "<pre><code> _italic_</code></pre>"
    parse(s).should == s
  end
  it "not parses UL lists" do
    s = "<pre><code>\n * l1\n * l2\n * l3</code></pre>"
    parse(s).should == s.sub("<code>\n","<code>")
  end
  it "not parses OL lists" do
    s = "<pre><code>\n # l1\n # l2\n # l3</code></pre>"
    parse(s).should == s.sub("<code>\n","<code>")
  end
  it "not parses H1..H5" do
    1.upto(5) do |i|
      s = "<pre><code>\nh#{i}. zzzzzzz\n</code></pre>"
      parse(s).should == "<pre><code>h#{i}. zzzzzzz</code></pre>"
    end
  end
  it "not parses raw text links" do
    s = "<pre><code>xxx http://www.ru yyy</code></pre>"
    parse(s).should == s
    s = "<pre><code>http://www.ru</code></pre>"
    parse(s).should == s
  end
  it "keeps newlines" do
    s = "<pre><code>aaa\nbbb</code></pre>"
    parse(s).should == s
    s = "<pre><code>aaa\n\nbbb\nccc</code></pre>"
    parse(s).should == s
  end

  it "with no spaces between <pre> and <code>" do
    s = "<pre><code>aaa</code></pre>"
    parse(s).should == s
  end

  it "with spaces between <pre> and <code>" do
    s = "<pre> <code>aaa</code> </pre>"
    parse(s).should == s.tr(' ','')
  end
  it "with spaces between <pre> and <code> and inside" do
    s = "<pre> <code> aaa bbb </code> </pre>"
    parse(s).should == "<pre><code> aaa bbb</code></pre>"
  end

  it "w/o closing tags" do
    s = "<pre><code>aaa"
    parse(s).should match(%r"<pre><code>aaa\n?</code></pre>")
  end

  it "in middle of text" do
    s = "xxx <pre><code>yyyy</code></pre> jjj"
    parse(s).should == s
  end

  it "with 2 instances" do
    s = "xxx <pre><code>yyyy</code></pre> <jjj> <pre><code>asdkjaslkd</code></pre> END"
    # The unknown tag sits outside both code blocks, so it is expected back in
    # escaped form (replacing '<jjj>' with itself would be a no-op).
    parse(s).should == s.sub('<jjj>','&lt;jjj&gt;')
  end

  it "works with unicode" do
    s = "привет <pre><code> жжж </code></pre> пока!"
    parse(s).should match(%r|привет ?<pre><code> жжж</code></pre> ?пока!|)

    s = 'абвгдеёжзийклмнопрстуфхцчшщьыъэюя'
    parse(s).should == s

    s = 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯ'
    parse(s).should == s

    s = '☸☹☺☻☼☽☾☿'
    parse(s).should == s
  end

  it "should escape lone closing tags" do
    s = "</code></pre>"
    parse(s).should == h(s)
  end

  it "should skip newlines and spaces at end" do
    s = "<pre><code> aaa bbb ccc \n\n\n \t\n\n\n\r\n\r\n \t </code></pre>"
    parse(s).should == "<pre><code> aaa bbb ccc</code></pre>"
  end

  it "escapes html chars" do
    HTML_ESCAPE.each do |k,v|
      parse("<pre><code>#{k}</code></pre>").should == "<pre><code>#{v}</code></pre>"
    end
  end
end
|
478
|
+
|
479
|
+
###############################################################################
|
480
|
+
|
481
|
+
# [[...]] link syntax. The table maps the text placed between the double
# brackets to the anchor expected when no custom caption is given.
describe "Assembla Links" do
  # NOTE(review): the "#with spc" row pairs single-space text with a
  # two-underscore href, and the "#with&" row ends in "&amp" without a
  # semicolon — both look altered from the original spec; confirm upstream.
  links = {
    "wiki:Name" => '<a class="wiki_link" title="Name" href="/wiki/show/test_space/Name">Name</a>',
    "Name" => '<a class="wiki_link" title="Name" href="/wiki/show/test_space/Name">Name</a>',
    "Name#Ref" => '<a class="wiki_link" title="Name#Ref" href="/wiki/show/test_space/Name#Ref">Name#Ref</a>',
    "#Ref" => '<a href="#Ref" title="#Ref" class="wiki_link">#Ref</a>',
    "#привет" => %Q|<a href="##{hex_string("привет")}" title="#привет" class="wiki_link">#привет</a>|,
    "#with spc" => %Q|<a href="#with__spc" title="#with spc" class="wiki_link">#with spc</a>|,
    "#with__usc" => %Q|<a href="#with__usc" title="#with__usc" class="wiki_link">#with__usc</a>|,
    "#with--dsh" => %Q|<a href="#with--dsh" title="#with--dsh" class="wiki_link">#with--dsh</a>|,
    "#with!xclm" => %Q|<a href="##{hex_string("with!xclm")}" title="#with!xclm" class="wiki_link">#with!xclm</a>|,
    "#with&" => %Q|<a href="##{hex_string("with&")}" title="#with&" class="wiki_link">#with&amp</a>|,

    "ticket:234" => '<a href="/spaces/test_space/tickets/234">#234</a>',
    "revision:1f4bdab77be696efd" =>
      '<a href="http://code.assembla.com/test_space/git/changesets/1f4bdab77be696efd">revision:1f4bdab77be696efd</a>',
    "revision:12345" =>
      '<a href="http://code.assembla.com/test_space/svn/changesets/12345">revision:12345</a>',
    "r:2345" => '<a href="http://code.assembla.com/test_space/svn/changesets/2345">revision:2345</a>',
    "r:2345ef" => '<a href="http://code.assembla.com/test_space/git/changesets/2345ef">revision:2345ef</a>',

    "url:http://www.ru" => '<a href="http://www.ru">http://www.ru</a>',
    "url:https://www.ru" => '<a href="https://www.ru">https://www.ru</a>',
    "url:www.ru" => '<a href="http://www.ru">http://www.ru</a>',
    "url:www.ru/?a=1&b=2" => '<a href="http://www.ru/?a=1&b=2">http://www.ru/?a=1&b=2</a>',
    "url:ftp://www.ru" => '<a href="ftp://www.ru">ftp://www.ru</a>',
    "url:/spaces/x2" => '<a href="/spaces/x2">/spaces/x2</a>'
  }

  # Three examples per row: the bare link, then two variants with a caption
  # after "|", where the caption replaces the anchor text of the expectation.
  links.each do |target, anchor|
    it "parses [[#{target}]]" do
      parse("[[#{target}]]").should == anchor
    end
    it "parses [[#{target}|привет тест]]" do
      parse("[[#{target}|привет тест]]").should == anchor.sub(/>.*</,">привет тест<")
    end
    # NOTE(review): the caption contains '&' and the expectation shows it
    # unescaped — confirm the intended expected text upstream.
    it "parses [[#{target}|test & here]]" do
      parse("[[#{target}|test & here]]").should == anchor.sub(/>.*</,">test & here<")
    end
  end

  it "keeps unknown link types" do
    ["[[zzz:xxx]]", "[[abcd:1234]]", "[[abcd::1234]] [[abcd:1234]] [[uri:www.ru]]"].each do |plain|
      parse(plain).should == plain
    end
  end

  # Not implemented yet: body-less examples are reported as pending.
  it "links to ExistingFile.txt"
  it "links to NotExistingFile.txt"
  it "links to ExistingImage.png"
  it "links to NotExistingImage.png"
end
|
535
|
+
|
536
|
+
###############################################################################
|
537
|
+
###############################################################################
|
538
|
+
###############################################################################
|
539
|
+
|
540
|
+
# HTML special characters and the entity each one is replaced with.
HTML_ESCAPE = { '&' => '&amp;', '>' => '&gt;', '<' => '&lt;', '"' => '&quot;' }

# Returns +s+ (converted with to_s) with every &, ", > and < replaced by its
# HTML entity, so the result can be compared with escaped parser output.
def h s
  s.to_s.gsub(/[&"><]/) { |special| HTML_ESCAPE[special] }
end
|
545
|
+
|
546
|
+
# Runs +s+ through BreakoutParser.parse and returns the result with leading
# and trailing whitespace removed.
def parse s
  html = BreakoutParser.parse(s)
  html.strip
end
|
549
|
+
end
|