mediacloth 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/mediacloth/mediawikiast.rb +5 -1
- data/lib/mediacloth/mediawikihtmlgenerator.rb +8 -2
- data/lib/mediacloth/mediawikilexer.rb +82 -7
- data/lib/mediacloth/mediawikiparser.rb +211 -139
- data/lib/mediacloth/mediawikiparser.y +36 -3
- data/lib/mediacloth/mediawikiwalker.rb +6 -0
- data/test/data/html1 +4 -19
- data/test/data/html2 +2 -2
- data/test/data/html3 +1 -1
- data/test/data/html4 +1 -1
- data/test/data/html5 +14 -0
- data/test/data/html6 +4 -4
- data/test/data/html7 +2 -1
- data/test/data/html8 +1 -0
- data/test/data/input5 +2 -0
- data/test/data/input8 +1 -0
- data/test/data/lex1 +10 -11
- data/test/data/lex2 +2 -2
- data/test/data/lex3 +1 -1
- data/test/data/lex4 +1 -1
- data/test/data/lex6 +4 -4
- data/test/data/lex7 +2 -2
- data/test/data/lex8 +1 -0
- data/test/dataproducers/html.rb +6 -5
- data/test/dataproducers/lex.rb +2 -2
- data/test/debugwalker.rb +5 -0
- data/test/lexer.rb +12 -3
- data/test/testhelper.rb +1 -1
- metadata +6 -12
- data/lib/mediacloth/mediawikihtmlgenerator.rb~ +0 -105
- data/lib/mediacloth/mediawikiparser.y~ +0 -172
- data/lib/mediacloth/mediawikiwalker.rb~ +0 -62
- data/lib/mediacloth.rb~ +0 -23
- data/test/dataproducers/html.rb~ +0 -23
- data/test/debugwalker.rb~ +0 -63
- data/test/htmlgenerator.rb~ +0 -25
- data/test/lexer.rb~ +0 -57
- data/test/parser.rb~ +0 -23
- data/test/testhelper.rb~ +0 -28
@@ -16,6 +16,10 @@ class WikiAST < AST
|
|
16
16
|
|
17
17
|
end
|
18
18
|
|
19
|
+
#The node to represent paragraph with text inside
|
20
|
+
class ParagraphAST < AST
|
21
|
+
end
|
22
|
+
|
19
23
|
#The node to represent a simple or formatted text
|
20
24
|
#with more AST nodes inside.
|
21
25
|
class FormattedAST < AST
|
@@ -31,7 +35,7 @@ end
|
|
31
35
|
#The node to represent a list
|
32
36
|
class ListAST < AST
|
33
37
|
#Currently recognized types: :Bulleted, :Numbered
|
34
|
-
attr_accessor :
|
38
|
+
attr_accessor :list_type
|
35
39
|
end
|
36
40
|
|
37
41
|
#The node to represent a list item
|
@@ -23,6 +23,12 @@ protected
|
|
23
23
|
super(ast)
|
24
24
|
end
|
25
25
|
|
26
|
+
def parse_paragraph(ast)
|
27
|
+
@html += "<p>"
|
28
|
+
super(ast)
|
29
|
+
@html += "</p>"
|
30
|
+
end
|
31
|
+
|
26
32
|
def parse_text(ast)
|
27
33
|
tag = formatting_to_tag(ast)
|
28
34
|
if tag[0].empty?
|
@@ -95,9 +101,9 @@ private
|
|
95
101
|
|
96
102
|
#returns a tag name of the list in ast node
|
97
103
|
def list_tag(ast)
|
98
|
-
if ast.
|
104
|
+
if ast.list_type == :Bulleted
|
99
105
|
return "ul"
|
100
|
-
elsif ast.
|
106
|
+
elsif ast.list_type == :Numbered
|
101
107
|
return "ol"
|
102
108
|
end
|
103
109
|
end
|
@@ -44,6 +44,7 @@ class MediaWikiLexer
|
|
44
44
|
@lexer_table["-"] = method(:match_line)
|
45
45
|
@lexer_table["~"] = method(:match_signature)
|
46
46
|
@lexer_table["h"] = method(:match_inline_link)
|
47
|
+
@lexer_table["\n"] = method(:match_newline)
|
47
48
|
end
|
48
49
|
|
49
50
|
#Transforms input stream (string) into the stream of tokens.
|
@@ -52,6 +53,7 @@ class MediaWikiLexer
|
|
52
53
|
#modification. The last token [false, false] indicates EOF.
|
53
54
|
def tokenize(input)
|
54
55
|
@tokens = []
|
56
|
+
start_para
|
55
57
|
@cursor = 0
|
56
58
|
@text = input
|
57
59
|
@next_token = []
|
@@ -68,7 +70,27 @@ class MediaWikiLexer
|
|
68
70
|
@current_token[1] += @text[@token_start, 1]
|
69
71
|
else
|
70
72
|
#skip empty :TEXT tokens
|
71
|
-
|
73
|
+
unless empty_text_token?
|
74
|
+
@tokens << @current_token
|
75
|
+
unless para_breaker?(@next_token[0])
|
76
|
+
#if no paragraph was previously started
|
77
|
+
#then we should start it
|
78
|
+
start_para if !@para
|
79
|
+
else
|
80
|
+
#if we already have a paragraph this is the time to close it
|
81
|
+
end_para if @para
|
82
|
+
end
|
83
|
+
|
84
|
+
end
|
85
|
+
|
86
|
+
if para_breaker?(@next_token[0])
|
87
|
+
if @tokens.last and @tokens.last[0] == :PARA_START
|
88
|
+
#we need to remove para start token because no para end is possible
|
89
|
+
@tokens.pop
|
90
|
+
@para = false
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
72
94
|
@next_token[1] = @text[@token_start, @cursor - @token_start]
|
73
95
|
@tokens << @next_token
|
74
96
|
#hack to enable sub-lexing!
|
@@ -77,6 +99,10 @@ class MediaWikiLexer
|
|
77
99
|
@sub_tokens = nil
|
78
100
|
end
|
79
101
|
#end of hack!
|
102
|
+
|
103
|
+
#if the next token can start the paragraph, let's try that
|
104
|
+
start_para if @tokens.last and para_starter?(@tokens.last[0])
|
105
|
+
|
80
106
|
@current_token = nil
|
81
107
|
@next_token = []
|
82
108
|
end
|
@@ -84,6 +110,13 @@ class MediaWikiLexer
|
|
84
110
|
#add the last TEXT token if it exists
|
85
111
|
@tokens << @current_token if @current_token and not empty_text_token?
|
86
112
|
|
113
|
+
#remove empty para start or finish the paragraph if necessary
|
114
|
+
if @tokens.last and @tokens.last[0] == :PARA_START
|
115
|
+
@tokens.pop
|
116
|
+
@para = false
|
117
|
+
else
|
118
|
+
end_para if @para
|
119
|
+
end
|
87
120
|
#RACC wants us to put this to indicate EOF
|
88
121
|
@tokens << [false, false]
|
89
122
|
@tokens
|
@@ -98,6 +131,18 @@ class MediaWikiLexer
|
|
98
131
|
|
99
132
|
|
100
133
|
private
|
134
|
+
#Returns true if the token breaks the paragraph.
|
135
|
+
def para_breaker?(token)
|
136
|
+
[:SECTION_START, :SECTION_END,
|
137
|
+
:UL_START, :UL_END, :OL_START, :OL_END,
|
138
|
+
:DL_START, :DL_END, :HLINE, :PRE].include?(token)
|
139
|
+
end
|
140
|
+
|
141
|
+
#Returns true if the paragraph can be started after the token
|
142
|
+
def para_starter?(token)
|
143
|
+
[:SECTION_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE].include?(token)
|
144
|
+
end
|
145
|
+
|
101
146
|
#-- ================== Match methods ================== ++#
|
102
147
|
|
103
148
|
#Matches anything that was not matched. Returns :TEXT to indicate
|
@@ -145,17 +190,17 @@ private
|
|
145
190
|
end
|
146
191
|
|
147
192
|
#Matches sections
|
148
|
-
# "=+" { return SECTION; }
|
149
193
|
def match_section
|
150
|
-
if
|
194
|
+
if at_start_of_line? or (@pair_stack.last[0] == :SECTION_START)
|
151
195
|
i = 0
|
152
196
|
i += 1 while @text[@cursor+i, 1] == "="
|
153
197
|
@cursor += i
|
154
|
-
@next_token[0] = :SECTION
|
155
198
|
|
156
|
-
if @pair_stack.last[0] == :
|
199
|
+
if @pair_stack.last[0] == :SECTION_START
|
200
|
+
@next_token[0] = :SECTION_END
|
157
201
|
@pair_stack.pop
|
158
202
|
else
|
203
|
+
@next_token[0] = :SECTION_START
|
159
204
|
@pair_stack.push @next_token
|
160
205
|
end
|
161
206
|
else
|
@@ -326,6 +371,20 @@ private
|
|
326
371
|
end
|
327
372
|
end
|
328
373
|
|
374
|
+
#Matches new line and breaks the paragraph if two newlines are met
|
375
|
+
def match_newline
|
376
|
+
if @text[@cursor, 2] == "\n\n"
|
377
|
+
if @para
|
378
|
+
@next_token[0] = :PARA_END
|
379
|
+
# @para = false
|
380
|
+
@sub_tokens = [[:PARA_START, ""]]
|
381
|
+
@cursor += 2
|
382
|
+
return
|
383
|
+
end
|
384
|
+
end
|
385
|
+
match_other
|
386
|
+
end
|
387
|
+
|
329
388
|
#-- ================== Helper methods ================== ++#
|
330
389
|
|
331
390
|
#Checks if the token is placed at the start of the line.
|
@@ -359,10 +418,16 @@ private
|
|
359
418
|
end
|
360
419
|
|
361
420
|
#Runs sublexer to tokenize sub_text
|
362
|
-
def sub_lex(sub_text)
|
421
|
+
def sub_lex(sub_text, strip_paragraphs=true)
|
363
422
|
sub_lexer = MediaWikiLexer.new
|
364
423
|
sub_tokens = sub_lexer.tokenize(sub_text)
|
365
|
-
sub_tokens.pop
|
424
|
+
sub_tokens.pop #false token
|
425
|
+
if strip_paragraphs
|
426
|
+
#the last PARA_END token
|
427
|
+
sub_tokens.pop if sub_tokens.last[0] == :PARA_END
|
428
|
+
#the first PARA_START token
|
429
|
+
sub_tokens.delete_at(0) if sub_tokens[0][0] == :PARA_START
|
430
|
+
end
|
366
431
|
sub_tokens
|
367
432
|
end
|
368
433
|
|
@@ -403,5 +468,15 @@ private
|
|
403
468
|
list
|
404
469
|
end
|
405
470
|
|
471
|
+
def start_para
|
472
|
+
@tokens << [:PARA_START, ""]
|
473
|
+
@para = true
|
474
|
+
end
|
475
|
+
|
476
|
+
def end_para
|
477
|
+
@tokens << [:PARA_END, ""]
|
478
|
+
@para = false
|
479
|
+
end
|
480
|
+
|
406
481
|
end
|
407
482
|
|