mediacloth 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/mediacloth/mediawikiast.rb +5 -1
- data/lib/mediacloth/mediawikihtmlgenerator.rb +8 -2
- data/lib/mediacloth/mediawikilexer.rb +82 -7
- data/lib/mediacloth/mediawikiparser.rb +211 -139
- data/lib/mediacloth/mediawikiparser.y +36 -3
- data/lib/mediacloth/mediawikiwalker.rb +6 -0
- data/test/data/html1 +4 -19
- data/test/data/html2 +2 -2
- data/test/data/html3 +1 -1
- data/test/data/html4 +1 -1
- data/test/data/html5 +14 -0
- data/test/data/html6 +4 -4
- data/test/data/html7 +2 -1
- data/test/data/html8 +1 -0
- data/test/data/input5 +2 -0
- data/test/data/input8 +1 -0
- data/test/data/lex1 +10 -11
- data/test/data/lex2 +2 -2
- data/test/data/lex3 +1 -1
- data/test/data/lex4 +1 -1
- data/test/data/lex6 +4 -4
- data/test/data/lex7 +2 -2
- data/test/data/lex8 +1 -0
- data/test/dataproducers/html.rb +6 -5
- data/test/dataproducers/lex.rb +2 -2
- data/test/debugwalker.rb +5 -0
- data/test/lexer.rb +12 -3
- data/test/testhelper.rb +1 -1
- metadata +6 -12
- data/lib/mediacloth/mediawikihtmlgenerator.rb~ +0 -105
- data/lib/mediacloth/mediawikiparser.y~ +0 -172
- data/lib/mediacloth/mediawikiwalker.rb~ +0 -62
- data/lib/mediacloth.rb~ +0 -23
- data/test/dataproducers/html.rb~ +0 -23
- data/test/debugwalker.rb~ +0 -63
- data/test/htmlgenerator.rb~ +0 -25
- data/test/lexer.rb~ +0 -57
- data/test/parser.rb~ +0 -23
- data/test/testhelper.rb~ +0 -28
@@ -16,6 +16,10 @@ class WikiAST < AST
|
|
16
16
|
|
17
17
|
end
|
18
18
|
|
19
|
+
#The node to represent paragraph with text inside
|
20
|
+
class ParagraphAST < AST
|
21
|
+
end
|
22
|
+
|
19
23
|
#The node to represent a simple or formatted text
|
20
24
|
#with more AST nodes inside.
|
21
25
|
class FormattedAST < AST
|
@@ -31,7 +35,7 @@ end
|
|
31
35
|
#The node to represent a list
|
32
36
|
class ListAST < AST
|
33
37
|
#Currently recognized types: :Bulleted, :Numbered
|
34
|
-
attr_accessor :
|
38
|
+
attr_accessor :list_type
|
35
39
|
end
|
36
40
|
|
37
41
|
#The node to represent a list item
|
@@ -23,6 +23,12 @@ protected
|
|
23
23
|
super(ast)
|
24
24
|
end
|
25
25
|
|
26
|
+
def parse_paragraph(ast)
|
27
|
+
@html += "<p>"
|
28
|
+
super(ast)
|
29
|
+
@html += "</p>"
|
30
|
+
end
|
31
|
+
|
26
32
|
def parse_text(ast)
|
27
33
|
tag = formatting_to_tag(ast)
|
28
34
|
if tag[0].empty?
|
@@ -95,9 +101,9 @@ private
|
|
95
101
|
|
96
102
|
#returns a tag name of the list in ast node
|
97
103
|
def list_tag(ast)
|
98
|
-
if ast.
|
104
|
+
if ast.list_type == :Bulleted
|
99
105
|
return "ul"
|
100
|
-
elsif ast.
|
106
|
+
elsif ast.list_type == :Numbered
|
101
107
|
return "ol"
|
102
108
|
end
|
103
109
|
end
|
@@ -44,6 +44,7 @@ class MediaWikiLexer
|
|
44
44
|
@lexer_table["-"] = method(:match_line)
|
45
45
|
@lexer_table["~"] = method(:match_signature)
|
46
46
|
@lexer_table["h"] = method(:match_inline_link)
|
47
|
+
@lexer_table["\n"] = method(:match_newline)
|
47
48
|
end
|
48
49
|
|
49
50
|
#Transforms input stream (string) into the stream of tokens.
|
@@ -52,6 +53,7 @@ class MediaWikiLexer
|
|
52
53
|
#modification. The last token [false, false] indicates EOF.
|
53
54
|
def tokenize(input)
|
54
55
|
@tokens = []
|
56
|
+
start_para
|
55
57
|
@cursor = 0
|
56
58
|
@text = input
|
57
59
|
@next_token = []
|
@@ -68,7 +70,27 @@ class MediaWikiLexer
|
|
68
70
|
@current_token[1] += @text[@token_start, 1]
|
69
71
|
else
|
70
72
|
#skip empty :TEXT tokens
|
71
|
-
|
73
|
+
unless empty_text_token?
|
74
|
+
@tokens << @current_token
|
75
|
+
unless para_breaker?(@next_token[0])
|
76
|
+
#if no paragraph was previously started
|
77
|
+
#then we should start it
|
78
|
+
start_para if !@para
|
79
|
+
else
|
80
|
+
#if we already have a paragraph this is the time to close it
|
81
|
+
end_para if @para
|
82
|
+
end
|
83
|
+
|
84
|
+
end
|
85
|
+
|
86
|
+
if para_breaker?(@next_token[0])
|
87
|
+
if @tokens.last and @tokens.last[0] == :PARA_START
|
88
|
+
#we need to remove para start token because no para end is possible
|
89
|
+
@tokens.pop
|
90
|
+
@para = false
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
72
94
|
@next_token[1] = @text[@token_start, @cursor - @token_start]
|
73
95
|
@tokens << @next_token
|
74
96
|
#hack to enable sub-lexing!
|
@@ -77,6 +99,10 @@ class MediaWikiLexer
|
|
77
99
|
@sub_tokens = nil
|
78
100
|
end
|
79
101
|
#end of hack!
|
102
|
+
|
103
|
+
#if the next token can start the paragraph, let's try that
|
104
|
+
start_para if @tokens.last and para_starter?(@tokens.last[0])
|
105
|
+
|
80
106
|
@current_token = nil
|
81
107
|
@next_token = []
|
82
108
|
end
|
@@ -84,6 +110,13 @@ class MediaWikiLexer
|
|
84
110
|
#add the last TEXT token if it exists
|
85
111
|
@tokens << @current_token if @current_token and not empty_text_token?
|
86
112
|
|
113
|
+
#remove empty para start or finish the paragraph if necessary
|
114
|
+
if @tokens.last and @tokens.last[0] == :PARA_START
|
115
|
+
@tokens.pop
|
116
|
+
@para = false
|
117
|
+
else
|
118
|
+
end_para if @para
|
119
|
+
end
|
87
120
|
#RACC wants us to put this to indicate EOF
|
88
121
|
@tokens << [false, false]
|
89
122
|
@tokens
|
@@ -98,6 +131,18 @@ class MediaWikiLexer
|
|
98
131
|
|
99
132
|
|
100
133
|
private
|
134
|
+
#Returns true if the token breaks the paragraph.
|
135
|
+
def para_breaker?(token)
|
136
|
+
[:SECTION_START, :SECTION_END,
|
137
|
+
:UL_START, :UL_END, :OL_START, :OL_END,
|
138
|
+
:DL_START, :DL_END, :HLINE, :PRE].include?(token)
|
139
|
+
end
|
140
|
+
|
141
|
+
#Returns true if the paragraph can be started after the token
|
142
|
+
def para_starter?(token)
|
143
|
+
[:SECTION_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE].include?(token)
|
144
|
+
end
|
145
|
+
|
101
146
|
#-- ================== Match methods ================== ++#
|
102
147
|
|
103
148
|
#Matches anything that was not matched. Returns :TEXT to indicate
|
@@ -145,17 +190,17 @@ private
|
|
145
190
|
end
|
146
191
|
|
147
192
|
#Matches sections
|
148
|
-
# "=+" { return SECTION; }
|
149
193
|
def match_section
|
150
|
-
if
|
194
|
+
if at_start_of_line? or (@pair_stack.last[0] == :SECTION_START)
|
151
195
|
i = 0
|
152
196
|
i += 1 while @text[@cursor+i, 1] == "="
|
153
197
|
@cursor += i
|
154
|
-
@next_token[0] = :SECTION
|
155
198
|
|
156
|
-
if @pair_stack.last[0] == :
|
199
|
+
if @pair_stack.last[0] == :SECTION_START
|
200
|
+
@next_token[0] = :SECTION_END
|
157
201
|
@pair_stack.pop
|
158
202
|
else
|
203
|
+
@next_token[0] = :SECTION_START
|
159
204
|
@pair_stack.push @next_token
|
160
205
|
end
|
161
206
|
else
|
@@ -326,6 +371,20 @@ private
|
|
326
371
|
end
|
327
372
|
end
|
328
373
|
|
374
|
+
#Matches new line and breaks the paragraph if two newlines are met
|
375
|
+
def match_newline
|
376
|
+
if @text[@cursor, 2] == "\n\n"
|
377
|
+
if @para
|
378
|
+
@next_token[0] = :PARA_END
|
379
|
+
# @para = false
|
380
|
+
@sub_tokens = [[:PARA_START, ""]]
|
381
|
+
@cursor += 2
|
382
|
+
return
|
383
|
+
end
|
384
|
+
end
|
385
|
+
match_other
|
386
|
+
end
|
387
|
+
|
329
388
|
#-- ================== Helper methods ================== ++#
|
330
389
|
|
331
390
|
#Checks if the token is placed at the start of the line.
|
@@ -359,10 +418,16 @@ private
|
|
359
418
|
end
|
360
419
|
|
361
420
|
#Runs sublexer to tokenize sub_text
|
362
|
-
def sub_lex(sub_text)
|
421
|
+
def sub_lex(sub_text, strip_paragraphs=true)
|
363
422
|
sub_lexer = MediaWikiLexer.new
|
364
423
|
sub_tokens = sub_lexer.tokenize(sub_text)
|
365
|
-
sub_tokens.pop
|
424
|
+
sub_tokens.pop #false token
|
425
|
+
if strip_paragraphs
|
426
|
+
#the last PARA_END token
|
427
|
+
sub_tokens.pop if sub_tokens.last[0] == :PARA_END
|
428
|
+
#the first PARA_START token
|
429
|
+
sub_tokens.delete_at(0) if sub_tokens[0][0] == :PARA_START
|
430
|
+
end
|
366
431
|
sub_tokens
|
367
432
|
end
|
368
433
|
|
@@ -403,5 +468,15 @@ private
|
|
403
468
|
list
|
404
469
|
end
|
405
470
|
|
471
|
+
def start_para
|
472
|
+
@tokens << [:PARA_START, ""]
|
473
|
+
@para = true
|
474
|
+
end
|
475
|
+
|
476
|
+
def end_para
|
477
|
+
@tokens << [:PARA_END, ""]
|
478
|
+
@para = false
|
479
|
+
end
|
480
|
+
|
406
481
|
end
|
407
482
|
|