maruku 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. data/bin/maruku +25 -0
  2. data/bin/marutex +29 -0
  3. data/docs/Makefile +25 -0
  4. data/docs/char_codes.xml +884 -0
  5. data/docs/color-package-demo.aux +1 -0
  6. data/docs/color-package-demo.log +127 -0
  7. data/docs/color-package-demo.tex +149 -0
  8. data/docs/index.html +74 -0
  9. data/docs/markdown_syntax.aux +13 -0
  10. data/docs/markdown_syntax.html +266 -0
  11. data/docs/markdown_syntax.log +287 -0
  12. data/docs/markdown_syntax.md +920 -0
  13. data/docs/markdown_syntax.out +0 -0
  14. data/docs/markdown_syntax.pdf +0 -0
  15. data/docs/markdown_syntax.tex +1203 -0
  16. data/docs/maruku.aux +13 -0
  17. data/docs/maruku.html +74 -0
  18. data/docs/maruku.log +294 -0
  19. data/docs/maruku.md +394 -0
  20. data/docs/maruku.out +0 -0
  21. data/docs/maruku.pdf +0 -0
  22. data/docs/maruku.tex +548 -0
  23. data/docs/style.css +65 -0
  24. data/docs/todo.md +12 -0
  25. data/lib/maruku.rb +20 -0
  26. data/lib/maruku/parse_block.rb +577 -0
  27. data/lib/maruku/parse_span.rb +336 -0
  28. data/lib/maruku/string_utils.rb +270 -0
  29. data/lib/maruku/structures.rb +31 -0
  30. data/lib/maruku/to_html.rb +430 -0
  31. data/lib/maruku/to_latex.rb +345 -0
  32. data/lib/maruku/to_latex_strings.rb +330 -0
  33. data/tests/abbreviations.md +11 -0
  34. data/tests/blank.md +4 -0
  35. data/tests/code.md +5 -0
  36. data/tests/code2.md +8 -0
  37. data/tests/code3.md +16 -0
  38. data/tests/email.md +4 -0
  39. data/tests/entities.md +19 -0
  40. data/tests/escaping.md +14 -0
  41. data/tests/extra_dl.md +101 -0
  42. data/tests/extra_header_id.md +13 -0
  43. data/tests/extra_table1.md +40 -0
  44. data/tests/footnotes.md +17 -0
  45. data/tests/headers.md +10 -0
  46. data/tests/hrule.md +10 -0
  47. data/tests/images.md +20 -0
  48. data/tests/inline_html.md +35 -0
  49. data/tests/links.md +31 -0
  50. data/tests/list1.md +4 -0
  51. data/tests/list2.md +5 -0
  52. data/tests/list3.md +8 -0
  53. data/tests/lists.md +32 -0
  54. data/tests/lists_ol.md +39 -0
  55. data/tests/misc_sw.md +105 -0
  56. data/tests/one.md +1 -0
  57. data/tests/paragraphs.md +13 -0
  58. data/tests/sss06.md +352 -0
  59. data/tests/test.md +4 -0
  60. metadata +113 -0
@@ -0,0 +1,336 @@
1
+
2
+ # There are two black-magic methods `match_couple_of` and `map_match`,
3
+ # defined at the end of the file, that make the function
4
+ # `parse_lines_as_span` so elegant.
5
+
6
class Maruku

  # Takes care of all span-level formatting: inline code, links, images,
  # footnote references, inline HTML, e-mail addresses, strong and emphasis.
  #
  # `lines` is an array of Strings that must not contain block-level
  # elements.
  #
  # Returns an array whose items are Strings and the elements built by
  # `create_md_element` (presumably MDElement instances -- the helper is
  # defined outside this file).
  #
  # Side effect: inline images and links register a "dummy_N" entry in
  # `@refs`, holding their url (and, for images, their title).
  def parse_lines_as_span(lines)
    # first, get rid of linebreaks
    span = MDElement.new
    span.children = resolve_linebreaks(lines)

    # then, encode all escapes, so that escaped characters are not
    # mistaken for markup by the passes below
    span.replace_each_string { |s| s.escape_md_special }

    # Search for ``code`` and then `code` markers. The longer delimiter
    # must be processed first. The raw code is unescaped right away in
    # both cases: it is stored in `meta`, so the final unescaping pass
    # (which only visits String children) would never reach it.
    ['``', '`'].each do |ticks|
      span.match_couple_of(ticks) { |children|
        e = create_md_element(:inline_code)
        # this is now opaque to processing
        e.meta[:raw_code] = children.join('').unescape_md_special
        e
      }
    end

    # Detect any immediate link: <http://www.google.com>
    # we expect an http: or something: at the beginning
    span.map_match(/<(\w+:[^\>]+)>/) { |match|
      e = create_md_element(:immediate_link, [])
      e.meta[:url] = match[1]
      e
    }

    # Search for inline HTML (the support is pretty basic for now)

    # this searches for a matching pair of tags
    matching_tags = %r{
      (         # put everything in 1
      <         # open
      (\w+)     # opening tag in 2
      >         # close
      .*        # anything
      </\2>     # match closing tag
      )
    }x

    # this searches for a single self-closed tag
    self_closed_tag = %r{
      (         # put everything in 1
      <         # open
      \w+       # tag name
      [^<>]*    # anything except angle brackets
      />        # closing of the tag
      )
    }x

    [matching_tags, self_closed_tag].each do |html_regexp|
      span.map_match(html_regexp) { |match|
        raw_html = (match[1] || raise("No html?"))
        e = create_md_element(:raw_html)
        e.meta[:raw_html] = raw_html
        begin
          # `Document` is presumably REXML::Document -- confirm where it
          # is brought into scope.
          e.meta[:parsed_html] = Document.new(raw_html)
        rescue
          # best effort: keep the raw text even when it cannot be parsed
          $stderr.puts "Malformed HTML:\n#{raw_html}"
        end
        e
      }
    end

    # Detect footnotes references: [^1]
    span.map_match(/\[(\^[^\]]+)\]/) { |match|
      e = create_md_element(:footnote_reference)
      e.meta[:footnote_id] = match[1].strip.downcase
      e
    }

    # Detect any image like ![Alt text][url]
    span.map_match(/\!\[([^\]]+)\]\s?\[([^\]]*)\]/) { |match|
      alt = match[1]
      id  = match[2].strip.downcase

      # ![Alt text][] uses the alt text itself as reference id
      id = alt.strip.downcase if id.size == 0

      e = create_md_element(:image)
      e.meta[:ref_id] = id
      e
    }

    # Detect any image with immediate url: ![Alt](url "title")
    # a dummy ref is created and put in the symbol table
    inline_image = /!\[([^\]]+)\]\s?\(([^\s\)]*)(?:\s+["'](.*)["'])?\)/
    span.map_match(inline_image) { |match|
      url   = match[2].strip
      title = match[3]

      # create a dummy id
      id = "dummy_#{@refs.size}"
      @refs[id] = {:url=>url, :title=>title}

      e = create_md_element(:image)
      e.meta[:ref_id] = id
      e
    }

    # Detect any link like [Google engine][google]
    span.map_match(/\[([^\]]+)\]\s?\[([^\]]*)\]/) { |match|
      text = match[1]
      id   = match[2].strip.downcase

      # [Google][] uses the link text itself as reference id
      id = text.strip.downcase if id.size == 0

      e = create_md_element(:link, [text])
      e.meta[:ref_id] = id
      e
    }

    # Detect any link with immediate url: [Google](http://www.google.com)
    # a dummy ref is created and put in the symbol table
    span.map_match(/\[([^\]]+)\]\s?\(([^\)]*)\)/) { |match|
      text = match[1]
      # The url is kept verbatim (apart from surrounding whitespace):
      # paths and query strings are case-sensitive.
      url = match[2].strip

      # create a dummy id
      id = "dummy_#{@refs.size}"
      @refs[id] = {:url=>url}

      e = create_md_element(:link, [text])
      e.meta[:ref_id] = id
      e
    }

    # Detect an email address <andrea@invalid.it>
    span.map_match(/<([^:]+@[^:]+)>/) { |match|
      e = create_md_element(:email_address, [])
      e.meta[:email] = match[1]
      e
    }

    # And now the easy stuff: **strong**, __strong__, *emphasis* and
    # _emphasis_. The double delimiters must be matched before the
    # single ones.
    [['**', :strong], ['__', :strong], ['*', :emphasis], ['_', :emphasis]].each do |marker, node_type|
      span.match_couple_of(marker) { |children| create_md_element(node_type, children) }
    end

    # finally, unescape the special characters
    span.replace_each_string { |s| s.unescape_md_special }

    span.children
  end

  # Joins consecutive lines with a single space, splitting the text
  # wherever `force_linebreak?` says a line ends with a forced break.
  #
  # Returns an array containing Strings and :linebreak elements.
  def resolve_linebreaks(lines)
    res = []
    s = ""
    lines.each do |l|
      # the separating space is added only between non-empty pieces
      s += " " if s.size > 0
      s += l.strip
      if force_linebreak?(l)
        res << s
        res << create_md_element(:linebreak)
        s = ""
      end
    end
    res << s if s.size > 0
    res
  end

end
201
+
202
+ # And now the black magic that makes the part above so elegant
203
+
204
class MDElement

  # Walks the hierarchy depth-first and calls the block with every
  # descendant MDElement whose `node_type` equals `e_node_type`.
  # A nil (or false) `e_node_type` selects every element.
  def each_element(e_node_type, &block)
    @children.each do |child|
      next unless child.kind_of? MDElement
      selected = !e_node_type || (e_node_type == child.node_type)
      block.call child if selected
      child.each_element(e_node_type, &block)
    end
  end

  # Replaces every String in the hierarchy with the result of the block.
  #
  # The block receives a String; whatever it returns is splatted, so it
  # may return a single item or an array of items that take the place
  # of the original String. Nested elements are processed first.
  def replace_each_string(&block)
    @children.each do |child|
      child.replace_each_string(&block) if child.kind_of? MDElement
    end

    rebuilt = []
    until @children.empty?
      child = @children.shift
      if child.kind_of? String
        rebuilt.concat([*block.call(child)])
      else
        rebuilt << child
      end
    end
    @children = rebuilt
  end

  # Applies the regexp to every String in the hierarchy (through
  # `replace_each_string`). Each occurrence is cut out of the String and
  # replaced by what the block returns for its MatchData:
  #
  #   ..., matched_string, ... -> ..., pre_match, block.call(match), post_match
  #
  # The rest of the String is scanned again for further occurrences.
  # The block might return arrays. Empty leftovers are dropped.
  def map_match(regexp, &block)
    replace_each_string do |text|
      pieces = []
      rest = text
      while (found = regexp.match(rest))
        # text before the occurrence is kept untouched
        pieces << found.pre_match unless found.pre_match.empty?
        # the occurrence itself is transformed by the block
        pieces.concat([*block.call(found)])
        # and the scan goes on with what follows
        rest = found.post_match
      end
      pieces << rest unless rest.empty?
      pieces
    end
  end

  # Finds pairs of `marker` delimiters in a hierarchy of Strings and
  # MDElements.
  #
  # Everything found between an opening and a closing delimiter (Strings
  # and elements alike) is handed to the block as an array; the value
  # returned by the block replaces the whole delimited run.
  #
  # When an opening delimiter has no closing one, a warning is written to
  # $stderr, the delimiter is replaced by "?" and the items that followed
  # it are processed again as ordinary content.
  def match_couple_of(marker, &block)
    delimiter = Regexp.new(Regexp.escape(marker))

    # nested elements pair their own delimiters first
    @children.each do |child|
      child.match_couple_of(marker, &block) if child.kind_of? MDElement
    end

    done = []

    until @children.empty?
      node = @children.shift
      opening = node.kind_of?(String) ? delimiter.match(node) : nil

      unless opening
        done << node
        next
      end

      # what precedes the opening delimiter is final; what follows it
      # goes back to the queue and is examined by the search below
      done << opening.pre_match unless opening.pre_match.empty?
      @children.unshift(opening.post_match) unless opening.post_match.empty?

      inside = []
      closed = false
      until @children.empty? || closed
        node = @children.shift
        closing = node.kind_of?(String) ? delimiter.match(node) : nil

        if closing
          closed = true
          # text before the closing delimiter belongs to the pair,
          # text after it will be processed again
          inside << closing.pre_match unless closing.pre_match.empty?
          @children.unshift(closing.post_match) unless closing.post_match.empty?
          done << block.call(inside)
        else
          inside << node
        end
      end

      next if closed

      $stderr.puts "##### Could not find closing for #{marker}"
      done << "?"
      # put the collected items back, in their original order
      inside.reverse_each { |item| @children.unshift item }
    end

    @children = done
  end
end
@@ -0,0 +1,270 @@
1
class Maruku

  # Splits a string into lines, removing the trailing newline of each.
  def split_lines(s)
    lines = []
    s.each_line { |line| lines << line.chomp }
    lines
  end

  # Parses e-mail style headers ("Key: value" lines followed by an empty
  # line) found at the very beginning of `s`.
  #
  # Returns a hash: `hash[:data]` is the text following the headers, and
  # every header is stored under its downcased name as a symbol.
  # When `s` does not start with a header block, `hash[:data]` is `s`
  # itself and no other key is set.
  def parse_email_headers(s)
    keys = {}
    # the header block only counts when it sits at offset 0
    unless (s =~ /((\w+: .*\n)+)\n/) == 0
      keys[:data] = s
      return keys
    end

    headers = Regexp.last_match(1)
    keys[:data] = Regexp.last_match.post_match
    headers.split("\n").each do |l|
      # split on the first colon only: the value may contain colons too
      # (e.g. "Link: http://example.com")
      k, v = l.split(':', 2)
      keys[k.strip.downcase.to_sym] = v.strip
    end
    keys
  end

  # Returns the number of leading spaces, considering that
  # a tab counts as `TabSize` spaces.
  def number_of_leading_spaces(s)
    indent = s[/\A[ \t]*/]
    indent.count(' ') + indent.count("\t") * TabSize
  end

  # Returns the position of the first real char in a list item, that is
  # the index following the leading whitespace, the list marker and the
  # whitespace after the marker.
  #
  # For example:
  #   '* Hello'    # => 2
  #   ' * Hello'   # => 3
  #   '1.Hello'    # => 2
  #   '1. Hello'   # => 3
  #   ' 1. Hello'  # => 4
  #
  # Returns nil when `line_node_type` does not classify the line as
  # :ulist or :olist.
  def spaces_before_first_char(s)
    case line_node_type(s)
    when :ulist
      i = 0
      # skip whitespace
      i += 1 while s[i, 1] =~ /\s/
      # skip indicator
      i += 1
      # skip whitespace
      i += 1 while s[i, 1] =~ /\s/
      i
    when :olist
      i = 0
      # skip whitespace
      i += 1 while s[i, 1] =~ /\s/
      # skip digits
      i += 1 while s[i, 1] =~ /\d/
      # skip dot
      i += 1
      # skip whitespace
      i += 1 while s[i, 1] =~ /\s/
      i
    else
      nil
    end
  end

  # Counts the number of leading '#' in the string. The last character
  # of the string is never counted, so a line made only of hashes keeps
  # one of them as its content.
  def num_leading_hashes(s)
    i = 0
    i += 1 while i < (s.size - 1) && (s[i, 1] == '#')
    i
  end

  # Strips initial and final hashes, and the whitespace around the text.
  def strip_hashes(s)
    s = s[num_leading_hashes(s), s.size]
    i = s.size - 1
    # walk back over trailing hashes and whitespace (index 0 is kept)
    i -= 1 while i > 0 && (s[i, 1] =~ /(#|\s)/)
    s[0, i + 1].strip
  end

  # Removes the initial quote marker ('>' and one optional whitespace
  # character) from the beginning of every line of `s`.
  def unquote(s)
    s.gsub(/^>\s?/, '')
  end

  # Removes at most `n` columns of indentation from the beginning of `s`.
  # A space counts for one column, a tab for `TabSize` columns; the scan
  # stops at the first character that is neither.
  def strip_indent(s, n)
    i = 0
    while i < s.size && n > 0
      case s[i, 1]
      when ' '  then n -= 1
      when "\t" then n -= TabSize
      else break
      end
      i += 1
    end
    # everything from i to the end of the string (the last character
    # must be kept even when nothing was stripped)
    s[i..-1]
  end

  # Writes `s` to the standard error stream.
  def debug(s)
    $stderr.puts s
  end

  # Writes every item of `a` to the standard error stream, one per line,
  # numbered from 1 and preceded by `prefix`.
  def dbg_describe_ary(a, prefix='')
    a.each_with_index do |l, index|
      $stderr.puts "#{prefix} (#{index + 1})##{l}#"
    end
  end

  # Tells whether the line asks for a forced line break, i.e. whether it
  # ends with two (or more) spaces, as in the Markdown syntax.
  # Returns a truthy value (the match position) or nil.
  def force_linebreak?(l)
    l =~ / {2}$/
  end

  # Classifies a single line, returning a symbol. The checks are
  # attempted in the order below and the first that succeeds wins.
  def line_node_type(l)
    return :code if number_of_leading_spaces(l) >= 4
    return :footnote_text if l =~ FootnoteText
    return :ref if l =~ LinkRegex || l =~ IncompleteLink
    return :abbreviation if l =~ Abbreviation
    return :definition if l =~ Definition
    # raw html is like PHP Markdown Extra: at most three spaces before
    return :raw_html if l =~ %r{^[ ]?[ ]?[ ]?</?\s*\w+}
    return :ulist if l =~ /^\s?(\*|-)\s+.*\w+/
    # one or more digits followed by a dot: "1." as well as "10."
    return :olist if l =~ /^\s?\d+\..*\w+/
    return :empty if l.strip.size == 0
    return :header1 if l =~ /^(=)+/
    # NOTE(review): a line made only of hyphens and whitespace is caught
    # here, so the hyphen :hrule test below looks unreachable -- confirm
    # how the block parser tells the two cases apart.
    return :header2 if l =~ /^([-\s])+$/
    return :header3 if l =~ /^(#)+\s*\S+/
    # at least three asterisks on a line, and only whitespace
    return :hrule if l =~ /^(\s*\*\s*){3,1000}$/
    return :hrule if l =~ /^(\s*-\s*){3,1000}$/ # or hyphens
    return :quote if l =~ /^>/
    return :metadata if l =~ /^@/
    return :text
  end

  # Definition of a term (up to three spaces, a colon, then the text).
  # Example:
  #   :blah blah
  #   : blah blah
  #    : blah blah
  Definition = %r{
    ^          # begin of line
    [ ]{0,3}   # up to 3 spaces
    :          # colon
    \s*        # whitespace
    (\S.*)     # the text = $1
    $          # end of line
  }x

  # Example:
  #   *[HTML]: Hyper Text Markup Language
  Abbreviation = %r{
    ^          # begin of line
    \*         # one asterisk
    \[         # opening bracket
    ([^\]]+)   # any non-closing bracket: id = $1
    \]         # closing bracket
    :          # colon
    \s*        # whitespace
    (\S.*\S)*  # definition=$2
    \s*        # strip this whitespace
    $          # end of line
  }x

  # Example:
  #   [^1]: text of the footnote
  FootnoteText = %r{
    ^\s*\[(\^.+)\]:  # id = $1 (including '^')
    \s*(\S.*)?$      # text = $2 (optional)
  }x

  # This regex is taken from BlueCloth sources
  # Link defs are in the form: ^[id]: \n? url "optional title"
  LinkRegex = %r{
    ^[ ]*\[(.+)\]:   # id = $1
    [ ]*
    <?(\S+)>?        # url = $2
    [ ]*
    (?:# Titles are delimited by "quotes" or (parens).
      ["(']
      (.+?)          # title = $3
      [")']          # Matching ) or "
      \s*(.+)?       # stuff = $4
    )?               # title is optional
  }x

  # A link definition whose url is not on the same line: "[id]:"
  IncompleteLink = %r{^\s*\[(.+)\]:\s*$}

  # A header carrying an explicit id, e.g. "Title {#the-id}":
  # text = $1, id = $2
  HeaderWithId = /^(.*)\{\#([\w_-]+)\}\s*$/

  # Number of columns a tab counts for when measuring indentation.
  TabSize = 4

  # if contains a pipe, it could be a table header
  MightBeTableHeader = %r{\|}
  # -------------:
  Sep = /\s*(\:)?\s*-+\s*(\:)?\s*/
  # | -------------:| ------------------------------ |
  TableSeparator = %r{^(\|?#{Sep}\|?)+\s*$}
end
224
+
225
class String
  # First code of the range used for the placeholders below.
  S = 240

  # Characters that can be escaped with a backslash, each one paired
  # with the code identifying its placeholder. The backslash comes
  # first, so that "\\" is consumed before any other escape sequence.
  MarkdownEscaped =
    [["\\",S+0],
     ['`',S+1],
     ['*',S+2],
     ['_',S+3],['{',S+4],['}',S+5],['[',S+6],[']',S+7],
     ['(',S+8],[')',S+9],['#',S+10],['.',S+11],
     ['!',S+12],
     # PHP Markdown extra
     ['|',S+13],[':',S+14]]

  MarkdownAdd = 200

  # Offset added to a code of `MarkdownEscaped` to obtain the code point
  # of its placeholder character. The placeholders live in the Unicode
  # private use area: they are valid characters for the regexps that run
  # on the escaped text, and they cannot clash with accented letters.
  # NOTE(review): strings are assumed to be UTF-8 (or plain ASCII);
  # confirm that no other file looks for the raw codes in escaped text.
  MarkdownPlaceholderBase = 0xE000

  # Replaces, in place, every backslash escape ("\*", "\_", ...) with a
  # one-character placeholder, so that the escaped character is not
  # taken for markup. A lone `*` or `_` (preceded by whitespace and
  # followed by whitespace or the end of a line) is protected in the
  # same way. Use `unescape_md_special!` to get the literal characters
  # back.
  #
  # Returns self.
  def escape_md_special!
    MarkdownEscaped.each do |char, code|
      # block form: the replacement is inserted verbatim
      gsub!("\\#{char}") { md_placeholder(code) }
    end

    # But if you surround an * or _ with spaces,
    # it'll be treated as a literal asterisk or underscore.
    # The surrounding whitespace is preserved: the leading one is
    # re-inserted, the trailing one is only looked at.
    [['*', S+2], ['_', S+3]].each do |char, code|
      lone = /(\s)#{Regexp.escape(char)}(?=\s|$)/
      gsub!(lone) { Regexp.last_match(1) + md_placeholder(code) }
    end

    self
  end

  # Replaces, in place, every placeholder inserted by
  # `escape_md_special!` with the literal character it stands for.
  #
  # Returns self.
  def unescape_md_special!
    MarkdownEscaped.each do |char, code|
      gsub!(md_placeholder(code)) { char }
    end
    self
  end

  # Non-destructive versions: they work on a copy of the string.
  def unescape_md_special; dup.unescape_md_special! end
  def escape_md_special; dup.escape_md_special! end

  private

  # Returns the placeholder (a one-character String) for a code of the
  # `MarkdownEscaped` table.
  def md_placeholder(code)
    [MarkdownPlaceholderBase + code].pack('U')
  end

end