qiita_marker 0.23.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (111) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +50 -0
  4. data/Rakefile +113 -0
  5. data/bin/qiita_marker +123 -0
  6. data/ext/qiita_marker/arena.c +103 -0
  7. data/ext/qiita_marker/autolink.c +425 -0
  8. data/ext/qiita_marker/autolink.h +8 -0
  9. data/ext/qiita_marker/blocks.c +1596 -0
  10. data/ext/qiita_marker/buffer.c +278 -0
  11. data/ext/qiita_marker/buffer.h +116 -0
  12. data/ext/qiita_marker/case_fold_switch.inc +4327 -0
  13. data/ext/qiita_marker/chunk.h +135 -0
  14. data/ext/qiita_marker/cmark-gfm-core-extensions.h +54 -0
  15. data/ext/qiita_marker/cmark-gfm-extension_api.h +736 -0
  16. data/ext/qiita_marker/cmark-gfm-extensions_export.h +42 -0
  17. data/ext/qiita_marker/cmark-gfm.h +817 -0
  18. data/ext/qiita_marker/cmark-gfm_export.h +42 -0
  19. data/ext/qiita_marker/cmark-gfm_version.h +7 -0
  20. data/ext/qiita_marker/cmark.c +55 -0
  21. data/ext/qiita_marker/cmark_ctype.c +44 -0
  22. data/ext/qiita_marker/cmark_ctype.h +33 -0
  23. data/ext/qiita_marker/commonmark.c +529 -0
  24. data/ext/qiita_marker/config.h +76 -0
  25. data/ext/qiita_marker/core-extensions.c +27 -0
  26. data/ext/qiita_marker/entities.inc +2138 -0
  27. data/ext/qiita_marker/ext_scanners.c +879 -0
  28. data/ext/qiita_marker/ext_scanners.h +24 -0
  29. data/ext/qiita_marker/extconf.rb +7 -0
  30. data/ext/qiita_marker/footnotes.c +63 -0
  31. data/ext/qiita_marker/footnotes.h +27 -0
  32. data/ext/qiita_marker/houdini.h +57 -0
  33. data/ext/qiita_marker/houdini_href_e.c +100 -0
  34. data/ext/qiita_marker/houdini_html_e.c +66 -0
  35. data/ext/qiita_marker/houdini_html_u.c +149 -0
  36. data/ext/qiita_marker/html.c +486 -0
  37. data/ext/qiita_marker/html.h +27 -0
  38. data/ext/qiita_marker/inlines.c +1691 -0
  39. data/ext/qiita_marker/inlines.h +29 -0
  40. data/ext/qiita_marker/iterator.c +159 -0
  41. data/ext/qiita_marker/iterator.h +26 -0
  42. data/ext/qiita_marker/latex.c +466 -0
  43. data/ext/qiita_marker/linked_list.c +37 -0
  44. data/ext/qiita_marker/man.c +278 -0
  45. data/ext/qiita_marker/map.c +122 -0
  46. data/ext/qiita_marker/map.h +41 -0
  47. data/ext/qiita_marker/node.c +979 -0
  48. data/ext/qiita_marker/node.h +125 -0
  49. data/ext/qiita_marker/parser.h +58 -0
  50. data/ext/qiita_marker/plaintext.c +235 -0
  51. data/ext/qiita_marker/plugin.c +36 -0
  52. data/ext/qiita_marker/plugin.h +34 -0
  53. data/ext/qiita_marker/qiita_marker.c +1321 -0
  54. data/ext/qiita_marker/qiita_marker.h +16 -0
  55. data/ext/qiita_marker/references.c +42 -0
  56. data/ext/qiita_marker/references.h +26 -0
  57. data/ext/qiita_marker/registry.c +63 -0
  58. data/ext/qiita_marker/registry.h +24 -0
  59. data/ext/qiita_marker/render.c +205 -0
  60. data/ext/qiita_marker/render.h +62 -0
  61. data/ext/qiita_marker/scanners.c +10520 -0
  62. data/ext/qiita_marker/scanners.h +62 -0
  63. data/ext/qiita_marker/scanners.re +341 -0
  64. data/ext/qiita_marker/strikethrough.c +167 -0
  65. data/ext/qiita_marker/strikethrough.h +9 -0
  66. data/ext/qiita_marker/syntax_extension.c +149 -0
  67. data/ext/qiita_marker/syntax_extension.h +34 -0
  68. data/ext/qiita_marker/table.c +822 -0
  69. data/ext/qiita_marker/table.h +12 -0
  70. data/ext/qiita_marker/tagfilter.c +60 -0
  71. data/ext/qiita_marker/tagfilter.h +8 -0
  72. data/ext/qiita_marker/tasklist.c +156 -0
  73. data/ext/qiita_marker/tasklist.h +8 -0
  74. data/ext/qiita_marker/utf8.c +317 -0
  75. data/ext/qiita_marker/utf8.h +35 -0
  76. data/ext/qiita_marker/xml.c +181 -0
  77. data/lib/qiita_marker/config.rb +52 -0
  78. data/lib/qiita_marker/node/inspect.rb +57 -0
  79. data/lib/qiita_marker/node.rb +83 -0
  80. data/lib/qiita_marker/renderer/html_renderer.rb +252 -0
  81. data/lib/qiita_marker/renderer.rb +135 -0
  82. data/lib/qiita_marker/version.rb +5 -0
  83. data/lib/qiita_marker.rb +45 -0
  84. data/qiita_marker.gemspec +40 -0
  85. data/test/benchmark.rb +32 -0
  86. data/test/fixtures/curly.md +1 -0
  87. data/test/fixtures/dingus.md +10 -0
  88. data/test/fixtures/strong.md +1 -0
  89. data/test/fixtures/table.md +10 -0
  90. data/test/test_attributes.rb +24 -0
  91. data/test/test_basics.rb +35 -0
  92. data/test/test_commands.rb +72 -0
  93. data/test/test_commonmark.rb +36 -0
  94. data/test/test_doc.rb +130 -0
  95. data/test/test_encoding.rb +23 -0
  96. data/test/test_extensions.rb +116 -0
  97. data/test/test_footnotes.rb +60 -0
  98. data/test/test_gc.rb +47 -0
  99. data/test/test_helper.rb +71 -0
  100. data/test/test_linebreaks.rb +15 -0
  101. data/test/test_maliciousness.rb +262 -0
  102. data/test/test_node.rb +89 -0
  103. data/test/test_options.rb +37 -0
  104. data/test/test_pathological_inputs.rb +94 -0
  105. data/test/test_plaintext.rb +46 -0
  106. data/test/test_renderer.rb +47 -0
  107. data/test/test_smartpunct.rb +27 -0
  108. data/test/test_spec.rb +30 -0
  109. data/test/test_tasklists.rb +43 -0
  110. data/test/test_xml.rb +107 -0
  111. metadata +313 -0
@@ -0,0 +1,425 @@
1
+ #include "autolink.h"
2
+ #include <parser.h>
3
+ #include <string.h>
4
+ #include <utf8.h>
5
+
6
+ #if defined(_WIN32)
7
+ #define strncasecmp _strnicmp
8
+ #else
9
+ #include <strings.h>
10
+ #endif
11
+
12
+ static int is_valid_hostchar(const uint8_t *link, size_t link_len) {
13
+ int32_t ch;
14
+ int r = cmark_utf8proc_iterate(link, (bufsize_t)link_len, &ch);
15
+ if (r < 0)
16
+ return 0;
17
+ return !cmark_utf8proc_is_space(ch) && !cmark_utf8proc_is_punctuation(ch);
18
+ }
19
+
20
/* Checks whether `link` (of `link_len` bytes) begins with one of the URI
 * schemes that may be autolinked: "http://", "https://" or "ftp://",
 * compared case-insensitively.
 *
 * The scheme must be followed by at least one more byte, and the character
 * right after the scheme must satisfy is_valid_hostchar().
 *
 * Returns 1 when such a scheme is found, 0 otherwise. */
static int sd_autolink_issafe(const uint8_t *link, size_t link_len) {
  static const char *const schemes[] = {"http://", "https://", "ftp://"};
  const size_t scheme_count = sizeof(schemes) / sizeof(schemes[0]);
  size_t idx = 0;

  while (idx < scheme_count) {
    const char *scheme = schemes[idx++];
    size_t scheme_len = strlen(scheme);

    /* Something has to follow the scheme itself. */
    if (link_len <= scheme_len)
      continue;

    if (strncasecmp((const char *)link, scheme, scheme_len) != 0)
      continue;

    if (is_valid_hostchar(link + scheme_len, link_len - scheme_len))
      return 1;
  }

  return 0;
}
36
+
37
/* Trims trailing delimiters from a candidate autolink.
 *
 * `data` points at the start of the candidate and `link_end` is its
 * tentative length. The candidate is first cut at the first '<'. Then,
 * working backwards from the end:
 *
 *   - trailing punctuation (one of ?!.,:*_~'") is dropped;
 *   - a trailing HTML-entity-like sequence ("&" + letters + ";") is dropped
 *     as a whole; a lone ';' is dropped on its own;
 *   - a trailing ')' is dropped only while the candidate contains more
 *     closing than opening parentheses.
 *
 * Examples for the parenthesis rule (input text => output linked portion):
 *
 *   http://www.pokemon.com/Pikachu_(Electric)
 *     => http://www.pokemon.com/Pikachu_(Electric)
 *
 *   http://www.pokemon.com/Pikachu_((Electric)
 *     => http://www.pokemon.com/Pikachu_((Electric)
 *
 *   http://www.pokemon.com/Pikachu_(Electric))
 *     => http://www.pokemon.com/Pikachu_(Electric)
 *
 *   http://www.pokemon.com/Pikachu_((Electric))
 *     => http://www.pokemon.com/Pikachu_((Electric))
 *
 * Returns the trimmed length, which may be 0.
 *
 * The parentheses are counted once up front and the counters are adjusted
 * as characters are trimmed, so the whole function is linear in `link_end`
 * (recounting on every trailing ')' is quadratic for inputs such as a long
 * run of ')'). None of the other trimmed characters is a parenthesis, so
 * the counters stay exact. */
static size_t autolink_delim(uint8_t *data, size_t link_end) {
  size_t opening = 0;
  size_t closing = 0;
  size_t i;

  for (i = 0; i < link_end; ++i) {
    const uint8_t c = data[i];

    if (c == '<') {
      link_end = i;
      break;
    }

    if (c == '(')
      opening++;
    else if (c == ')')
      closing++;
  }

  while (link_end > 0) {
    switch (data[link_end - 1]) {
    case '\0': /* kept: a strchr()-based punctuation test also matches NUL */
    case '?':
    case '!':
    case '.':
    case ',':
    case ':':
    case '*':
    case '_':
    case '~':
    case '\'':
    case '"':
      link_end--;
      break;

    case ';': {
      size_t new_end;

      /* A one-byte candidate cannot hold an entity; avoid computing
       * link_end - 2, which would wrap around and index out of bounds. */
      if (link_end < 2) {
        link_end--;
        break;
      }

      new_end = link_end - 2;

      while (new_end > 0 && cmark_isalpha(data[new_end]))
        new_end--;

      /* Only treat it as an entity if at least one letter sits between
       * the '&' and the ';'. */
      if (new_end < link_end - 2 && data[new_end] == '&')
        link_end = new_end;
      else
        link_end--;
      break;
    }

    case ')':
      /* Balanced (or surplus opening) parentheses: keep the ')'. */
      if (closing <= opening)
        return link_end;

      closing--;
      link_end--;
      break;

    default:
      return link_end;
    }
  }

  return link_end;
}
115
+
116
/* Scans `data` (of `size` bytes) for a run of characters that look like a
 * domain name and returns the index at which the scan stopped, or 0 when
 * the run is rejected.
 *
 * The scan starts at index 1 and never examines the last byte. It stops at
 * the first character that is not '_', '.', '-' or accepted by
 * is_valid_hostchar().
 *
 * `uscore2` counts the underscores seen since the most recent '.', and
 * `uscore1` holds that count for the segment before the most recent '.'.
 * The run is rejected when either is non-zero, i.e. when one of the last
 * two scanned segments contains an underscore.
 *
 * When `allow_short` is zero the run must also contain at least one '.'.
 *
 * An empty buffer is rejected up front: `size - 1` would otherwise wrap
 * around and the loop would read far past the buffer. */
static size_t check_domain(uint8_t *data, size_t size, int allow_short) {
  size_t i, np = 0, uscore1 = 0, uscore2 = 0;

  if (size == 0)
    return 0;

  for (i = 1; i + 1 < size; i++) {
    if (data[i] == '_')
      uscore2++;
    else if (data[i] == '.') {
      uscore1 = uscore2;
      uscore2 = 0;
      np++;
    } else if (!is_valid_hostchar(data + i, size - i) && data[i] != '-')
      break;
  }

  if (uscore1 > 0 || uscore2 > 0)
    return 0;

  if (allow_short) {
    /* We don't need a valid domain in the strict sense (with at
     * least one dot); so just make sure it's composed of valid
     * domain characters and return the length of the valid
     * sequence. */
    return i;
  } else {
    /* A valid domain needs to have at least one dot.
     * That's as far as we get. */
    return np ? i : 0;
  }
}
145
+
146
+ static cmark_node *www_match(cmark_parser *parser, cmark_node *parent,
147
+ cmark_inline_parser *inline_parser) {
148
+ cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser);
149
+ size_t max_rewind = cmark_inline_parser_get_offset(inline_parser);
150
+ uint8_t *data = chunk->data + max_rewind;
151
+ size_t size = chunk->len - max_rewind;
152
+ int start = cmark_inline_parser_get_column(inline_parser);
153
+
154
+ size_t link_end;
155
+
156
+ if (max_rewind > 0 && strchr("*_~(", data[-1]) == NULL &&
157
+ !cmark_isspace(data[-1]))
158
+ return 0;
159
+
160
+ if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0)
161
+ return 0;
162
+
163
+ link_end = check_domain(data, size, 0);
164
+
165
+ if (link_end == 0)
166
+ return NULL;
167
+
168
+ while (link_end < size && !cmark_isspace(data[link_end]))
169
+ link_end++;
170
+
171
+ link_end = autolink_delim(data, link_end);
172
+
173
+ if (link_end == 0)
174
+ return NULL;
175
+
176
+ cmark_inline_parser_set_offset(inline_parser, (int)(max_rewind + link_end));
177
+
178
+ cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
179
+
180
+ cmark_strbuf buf;
181
+ cmark_strbuf_init(parser->mem, &buf, 10);
182
+ cmark_strbuf_puts(&buf, "http://");
183
+ cmark_strbuf_put(&buf, data, (bufsize_t)link_end);
184
+ node->as.link.url = cmark_chunk_buf_detach(&buf);
185
+
186
+ cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
187
+ text->as.literal =
188
+ cmark_chunk_dup(chunk, (bufsize_t)max_rewind, (bufsize_t)link_end);
189
+ cmark_node_append_child(node, text);
190
+
191
+ node->start_line = text->start_line =
192
+ node->end_line = text->end_line =
193
+ cmark_inline_parser_get_line(inline_parser);
194
+
195
+ node->start_column = text->start_column = start - 1;
196
+ node->end_column = text->end_column = cmark_inline_parser_get_column(inline_parser) - 1;
197
+
198
+ return node;
199
+ }
200
+
201
+ static cmark_node *url_match(cmark_parser *parser, cmark_node *parent,
202
+ cmark_inline_parser *inline_parser) {
203
+ size_t link_end, domain_len;
204
+ int rewind = 0;
205
+
206
+ cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser);
207
+ int max_rewind = cmark_inline_parser_get_offset(inline_parser);
208
+ uint8_t *data = chunk->data + max_rewind;
209
+ size_t size = chunk->len - max_rewind;
210
+
211
+ if (size < 4 || data[1] != '/' || data[2] != '/')
212
+ return 0;
213
+
214
+ while (rewind < max_rewind && cmark_isalpha(data[-rewind - 1]))
215
+ rewind++;
216
+
217
+ if (!sd_autolink_issafe(data - rewind, size + rewind))
218
+ return 0;
219
+
220
+ link_end = strlen("://");
221
+
222
+ domain_len = check_domain(data + link_end, size - link_end, 1);
223
+
224
+ if (domain_len == 0)
225
+ return 0;
226
+
227
+ link_end += domain_len;
228
+ while (link_end < size && !cmark_isspace(data[link_end]))
229
+ link_end++;
230
+
231
+ link_end = autolink_delim(data, link_end);
232
+
233
+ if (link_end == 0)
234
+ return NULL;
235
+
236
+ cmark_inline_parser_set_offset(inline_parser, (int)(max_rewind + link_end));
237
+ cmark_node_unput(parent, rewind);
238
+
239
+ cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
240
+
241
+ cmark_chunk url = cmark_chunk_dup(chunk, max_rewind - rewind,
242
+ (bufsize_t)(link_end + rewind));
243
+ node->as.link.url = url;
244
+
245
+ cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
246
+ text->as.literal = url;
247
+ cmark_node_append_child(node, text);
248
+
249
+ return node;
250
+ }
251
+
252
+ static cmark_node *match(cmark_syntax_extension *ext, cmark_parser *parser,
253
+ cmark_node *parent, unsigned char c,
254
+ cmark_inline_parser *inline_parser) {
255
+ if (cmark_inline_parser_in_bracket(inline_parser, false) ||
256
+ cmark_inline_parser_in_bracket(inline_parser, true))
257
+ return NULL;
258
+
259
+ if (c == ':')
260
+ return url_match(parser, parent, inline_parser);
261
+
262
+ if (c == 'w')
263
+ return www_match(parser, parent, inline_parser);
264
+
265
+ return NULL;
266
+
267
+ // note that we could end up re-consuming something already a
268
+ // part of an inline, because we don't track when the last
269
+ // inline was finished in inlines.c.
270
+ }
271
+
272
+ static void postprocess_text(cmark_parser *parser, cmark_node *text, int offset, int depth) {
273
+ // postprocess_text can recurse very deeply if there is a very long line of
274
+ // '@' only. Stop at a reasonable depth to ensure it cannot crash.
275
+ if (depth > 1000) return;
276
+
277
+ size_t link_end;
278
+ uint8_t *data = text->as.literal.data,
279
+ *at;
280
+ size_t size = text->as.literal.len;
281
+ int rewind, max_rewind,
282
+ nb = 0, np = 0, ns = 0;
283
+
284
+ if (offset < 0 || (size_t)offset >= size)
285
+ return;
286
+
287
+ data += offset;
288
+ size -= offset;
289
+
290
+ at = (uint8_t *)memchr(data, '@', size);
291
+ if (!at)
292
+ return;
293
+
294
+ max_rewind = (int)(at - data);
295
+ data += max_rewind;
296
+ size -= max_rewind;
297
+
298
+ for (rewind = 0; rewind < max_rewind; ++rewind) {
299
+ uint8_t c = data[-rewind - 1];
300
+
301
+ if (cmark_isalnum(c))
302
+ continue;
303
+
304
+ if (strchr(".+-_", c) != NULL)
305
+ continue;
306
+
307
+ if (c == '/')
308
+ ns++;
309
+
310
+ break;
311
+ }
312
+
313
+ if (rewind == 0 || ns > 0) {
314
+ postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
315
+ return;
316
+ }
317
+
318
+ for (link_end = 0; link_end < size; ++link_end) {
319
+ uint8_t c = data[link_end];
320
+
321
+ if (cmark_isalnum(c))
322
+ continue;
323
+
324
+ if (c == '@')
325
+ nb++;
326
+ else if (c == '.' && link_end < size - 1 && cmark_isalnum(data[link_end + 1]))
327
+ np++;
328
+ else if (c != '-' && c != '_')
329
+ break;
330
+ }
331
+
332
+ if (link_end < 2 || nb != 1 || np == 0 ||
333
+ (!cmark_isalpha(data[link_end - 1]) && data[link_end - 1] != '.')) {
334
+ postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
335
+ return;
336
+ }
337
+
338
+ link_end = autolink_delim(data, link_end);
339
+
340
+ if (link_end == 0) {
341
+ postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
342
+ return;
343
+ }
344
+
345
+ cmark_chunk_to_cstr(parser->mem, &text->as.literal);
346
+
347
+ cmark_node *link_node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
348
+ cmark_strbuf buf;
349
+ cmark_strbuf_init(parser->mem, &buf, 10);
350
+ cmark_strbuf_puts(&buf, "mailto:");
351
+ cmark_strbuf_put(&buf, data - rewind, (bufsize_t)(link_end + rewind));
352
+ link_node->as.link.url = cmark_chunk_buf_detach(&buf);
353
+
354
+ cmark_node *link_text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
355
+ cmark_chunk email = cmark_chunk_dup(
356
+ &text->as.literal,
357
+ offset + max_rewind - rewind,
358
+ (bufsize_t)(link_end + rewind));
359
+ cmark_chunk_to_cstr(parser->mem, &email);
360
+ link_text->as.literal = email;
361
+ cmark_node_append_child(link_node, link_text);
362
+
363
+ cmark_node_insert_after(text, link_node);
364
+
365
+ cmark_node *post = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
366
+ post->as.literal = cmark_chunk_dup(&text->as.literal,
367
+ (bufsize_t)(offset + max_rewind + link_end),
368
+ (bufsize_t)(size - link_end));
369
+ cmark_chunk_to_cstr(parser->mem, &post->as.literal);
370
+
371
+ cmark_node_insert_after(link_node, post);
372
+
373
+ text->as.literal.len = offset + max_rewind - rewind;
374
+ text->as.literal.data[text->as.literal.len] = 0;
375
+
376
+ postprocess_text(parser, post, 0, depth + 1);
377
+ }
378
+
379
+ static cmark_node *postprocess(cmark_syntax_extension *ext, cmark_parser *parser, cmark_node *root) {
380
+ cmark_iter *iter;
381
+ cmark_event_type ev;
382
+ cmark_node *node;
383
+ bool in_link = false;
384
+
385
+ cmark_consolidate_text_nodes(root);
386
+ iter = cmark_iter_new(root);
387
+
388
+ while ((ev = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
389
+ node = cmark_iter_get_node(iter);
390
+ if (in_link) {
391
+ if (ev == CMARK_EVENT_EXIT && node->type == CMARK_NODE_LINK) {
392
+ in_link = false;
393
+ }
394
+ continue;
395
+ }
396
+
397
+ if (ev == CMARK_EVENT_ENTER && node->type == CMARK_NODE_LINK) {
398
+ in_link = true;
399
+ continue;
400
+ }
401
+
402
+ if (ev == CMARK_EVENT_ENTER && node->type == CMARK_NODE_TEXT) {
403
+ postprocess_text(parser, node, 0, /*depth*/0);
404
+ }
405
+ }
406
+
407
+ cmark_iter_free(iter);
408
+
409
+ return root;
410
+ }
411
+
412
+ cmark_syntax_extension *create_autolink_extension(void) {
413
+ cmark_syntax_extension *ext = cmark_syntax_extension_new("autolink");
414
+ cmark_llist *special_chars = NULL;
415
+
416
+ cmark_syntax_extension_set_match_inline_func(ext, match);
417
+ cmark_syntax_extension_set_postprocess_func(ext, postprocess);
418
+
419
+ cmark_mem *mem = cmark_get_default_mem_allocator();
420
+ special_chars = cmark_llist_append(mem, special_chars, (void *)':');
421
+ special_chars = cmark_llist_append(mem, special_chars, (void *)'w');
422
+ cmark_syntax_extension_set_special_inline_chars(ext, special_chars);
423
+
424
+ return ext;
425
+ }
@@ -0,0 +1,8 @@
1
+ #ifndef CMARK_GFM_AUTOLINK_H
2
+ #define CMARK_GFM_AUTOLINK_H
3
+
4
+ #include "cmark-gfm-core-extensions.h"
5
+
6
+ cmark_syntax_extension *create_autolink_extension(void);
7
+
8
+ #endif