mediacloth 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,10 @@ class WikiAST < AST
16
16
 
17
17
  end
18
18
 
19
+ #The node to represent a paragraph with text inside
20
+ class ParagraphAST < AST
21
+ end
22
+
19
23
  #The node to represent a simple or formatted text
20
24
  #with more AST nodes inside.
21
25
  class FormattedAST < AST
@@ -31,7 +35,7 @@ end
31
35
  #The node to represent a list
32
36
  class ListAST < AST
33
37
  #Currently recognized types: :Bulleted, :Numbered
34
- attr_accessor :type
38
+ attr_accessor :list_type
35
39
  end
36
40
 
37
41
  #The node to represent a list item
@@ -23,6 +23,12 @@ protected
23
23
  super(ast)
24
24
  end
25
25
 
26
+ def parse_paragraph(ast)
27
+ @html += "<p>"
28
+ super(ast)
29
+ @html += "</p>"
30
+ end
31
+
26
32
  def parse_text(ast)
27
33
  tag = formatting_to_tag(ast)
28
34
  if tag[0].empty?
@@ -95,9 +101,9 @@ private
95
101
 
96
102
  #returns a tag name of the list in ast node
97
103
  def list_tag(ast)
98
- if ast.type == :Bulleted
104
+ if ast.list_type == :Bulleted
99
105
  return "ul"
100
- elsif ast.type == :Numbered
106
+ elsif ast.list_type == :Numbered
101
107
  return "ol"
102
108
  end
103
109
  end
@@ -44,6 +44,7 @@ class MediaWikiLexer
44
44
  @lexer_table["-"] = method(:match_line)
45
45
  @lexer_table["~"] = method(:match_signature)
46
46
  @lexer_table["h"] = method(:match_inline_link)
47
+ @lexer_table["\n"] = method(:match_newline)
47
48
  end
48
49
 
49
50
  #Transforms input stream (string) into the stream of tokens.
@@ -52,6 +53,7 @@ class MediaWikiLexer
52
53
  #modification. The last token [false, false] indicates EOF.
53
54
  def tokenize(input)
54
55
  @tokens = []
56
+ start_para
55
57
  @cursor = 0
56
58
  @text = input
57
59
  @next_token = []
@@ -68,7 +70,27 @@ class MediaWikiLexer
68
70
  @current_token[1] += @text[@token_start, 1]
69
71
  else
70
72
  #skip empty :TEXT tokens
71
- @tokens << @current_token unless empty_text_token?
73
+ unless empty_text_token?
74
+ @tokens << @current_token
75
+ unless para_breaker?(@next_token[0])
76
+ #if no paragraph was previously started
77
+ #then we should start it
78
+ start_para if !@para
79
+ else
80
+ #if we already have a paragraph this is the time to close it
81
+ end_para if @para
82
+ end
83
+
84
+ end
85
+
86
+ if para_breaker?(@next_token[0])
87
+ if @tokens.last and @tokens.last[0] == :PARA_START
88
+ #we need to remove para start token because no para end is possible
89
+ @tokens.pop
90
+ @para = false
91
+ end
92
+ end
93
+
72
94
  @next_token[1] = @text[@token_start, @cursor - @token_start]
73
95
  @tokens << @next_token
74
96
  #hack to enable sub-lexing!
@@ -77,6 +99,10 @@ class MediaWikiLexer
77
99
  @sub_tokens = nil
78
100
  end
79
101
  #end of hack!
102
+
103
+ #if the next token can start the paragraph, let's try that
104
+ start_para if @tokens.last and para_starter?(@tokens.last[0])
105
+
80
106
  @current_token = nil
81
107
  @next_token = []
82
108
  end
@@ -84,6 +110,13 @@ class MediaWikiLexer
84
110
  #add the last TEXT token if it exists
85
111
  @tokens << @current_token if @current_token and not empty_text_token?
86
112
 
113
+ #remove empty para start or finish the paragraph if necessary
114
+ if @tokens.last and @tokens.last[0] == :PARA_START
115
+ @tokens.pop
116
+ @para = false
117
+ else
118
+ end_para if @para
119
+ end
87
120
  #RACC wants us to put this to indicate EOF
88
121
  @tokens << [false, false]
89
122
  @tokens
@@ -98,6 +131,18 @@ class MediaWikiLexer
98
131
 
99
132
 
100
133
  private
134
+ #Returns true if the token breaks the paragraph.
135
+ def para_breaker?(token)
136
+ [:SECTION_START, :SECTION_END,
137
+ :UL_START, :UL_END, :OL_START, :OL_END,
138
+ :DL_START, :DL_END, :HLINE, :PRE].include?(token)
139
+ end
140
+
141
+ #Returns true if the paragraph can be started after the token
142
+ def para_starter?(token)
143
+ [:SECTION_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE].include?(token)
144
+ end
145
+
101
146
  #-- ================== Match methods ================== ++#
102
147
 
103
148
  #Matches anything that was not matched. Returns :TEXT to indicate
@@ -145,17 +190,17 @@ private
145
190
  end
146
191
 
147
192
  #Matches sections
148
- # "=+" { return SECTION; }
149
193
  def match_section
150
- if (@text[@cursor-1, 1] == "\n") or (@pair_stack.last[0] == :SECTION)
194
+ if at_start_of_line? or (@pair_stack.last[0] == :SECTION_START)
151
195
  i = 0
152
196
  i += 1 while @text[@cursor+i, 1] == "="
153
197
  @cursor += i
154
- @next_token[0] = :SECTION
155
198
 
156
- if @pair_stack.last[0] == :SECTION
199
+ if @pair_stack.last[0] == :SECTION_START
200
+ @next_token[0] = :SECTION_END
157
201
  @pair_stack.pop
158
202
  else
203
+ @next_token[0] = :SECTION_START
159
204
  @pair_stack.push @next_token
160
205
  end
161
206
  else
@@ -326,6 +371,20 @@ private
326
371
  end
327
372
  end
328
373
 
374
+ #Matches new line and breaks the paragraph if two newlines are met
375
+ def match_newline
376
+ if @text[@cursor, 2] == "\n\n"
377
+ if @para
378
+ @next_token[0] = :PARA_END
379
+ # @para = false
380
+ @sub_tokens = [[:PARA_START, ""]]
381
+ @cursor += 2
382
+ return
383
+ end
384
+ end
385
+ match_other
386
+ end
387
+
329
388
  #-- ================== Helper methods ================== ++#
330
389
 
331
390
  #Checks if the token is placed at the start of the line.
@@ -359,10 +418,16 @@ private
359
418
  end
360
419
 
361
420
  #Runs sublexer to tokenize sub_text
362
- def sub_lex(sub_text)
421
+ def sub_lex(sub_text, strip_paragraphs=true)
363
422
  sub_lexer = MediaWikiLexer.new
364
423
  sub_tokens = sub_lexer.tokenize(sub_text)
365
- sub_tokens.pop
424
+ sub_tokens.pop #false token
425
+ if strip_paragraphs
426
+ #the last PARA_END token
427
+ sub_tokens.pop if sub_tokens.last[0] == :PARA_END
428
+ #the first PARA_START token
429
+ sub_tokens.delete_at(0) if sub_tokens[0][0] == :PARA_START
430
+ end
366
431
  sub_tokens
367
432
  end
368
433
 
@@ -403,5 +468,15 @@ private
403
468
  list
404
469
  end
405
470
 
471
+ def start_para
472
+ @tokens << [:PARA_START, ""]
473
+ @para = true
474
+ end
475
+
476
+ def end_para
477
+ @tokens << [:PARA_END, ""]
478
+ @para = false
479
+ end
480
+
406
481
  end
407
482