commonmarker 0.23.6 → 1.0.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +70 -212
  3. data/commonmarker.gemspec +34 -31
  4. data/ext/commonmarker/Cargo.toml +12 -0
  5. data/ext/commonmarker/_util.rb +102 -0
  6. data/ext/commonmarker/extconf.rb +4 -5
  7. data/ext/commonmarker/src/comrak_options.rs +107 -0
  8. data/ext/commonmarker/src/lib.rs +27 -0
  9. data/lib/commonmarker/config.rb +58 -37
  10. data/lib/commonmarker/extension.rb +14 -0
  11. data/lib/commonmarker/renderer.rb +1 -127
  12. data/lib/commonmarker/version.rb +2 -2
  13. data/lib/commonmarker.rb +19 -32
  14. metadata +33 -177
  15. data/Rakefile +0 -109
  16. data/bin/commonmarker +0 -118
  17. data/ext/commonmarker/arena.c +0 -103
  18. data/ext/commonmarker/autolink.c +0 -456
  19. data/ext/commonmarker/autolink.h +0 -8
  20. data/ext/commonmarker/blocks.c +0 -1596
  21. data/ext/commonmarker/buffer.c +0 -278
  22. data/ext/commonmarker/buffer.h +0 -116
  23. data/ext/commonmarker/case_fold_switch.inc +0 -4327
  24. data/ext/commonmarker/chunk.h +0 -135
  25. data/ext/commonmarker/cmark-gfm-core-extensions.h +0 -54
  26. data/ext/commonmarker/cmark-gfm-extension_api.h +0 -736
  27. data/ext/commonmarker/cmark-gfm-extensions_export.h +0 -42
  28. data/ext/commonmarker/cmark-gfm.h +0 -817
  29. data/ext/commonmarker/cmark-gfm_export.h +0 -42
  30. data/ext/commonmarker/cmark-gfm_version.h +0 -7
  31. data/ext/commonmarker/cmark.c +0 -55
  32. data/ext/commonmarker/cmark_ctype.c +0 -44
  33. data/ext/commonmarker/cmark_ctype.h +0 -33
  34. data/ext/commonmarker/commonmark.c +0 -529
  35. data/ext/commonmarker/commonmarker.c +0 -1307
  36. data/ext/commonmarker/commonmarker.h +0 -16
  37. data/ext/commonmarker/config.h +0 -76
  38. data/ext/commonmarker/core-extensions.c +0 -27
  39. data/ext/commonmarker/entities.inc +0 -2138
  40. data/ext/commonmarker/ext_scanners.c +0 -879
  41. data/ext/commonmarker/ext_scanners.h +0 -24
  42. data/ext/commonmarker/footnotes.c +0 -63
  43. data/ext/commonmarker/footnotes.h +0 -27
  44. data/ext/commonmarker/houdini.h +0 -57
  45. data/ext/commonmarker/houdini_href_e.c +0 -100
  46. data/ext/commonmarker/houdini_html_e.c +0 -66
  47. data/ext/commonmarker/houdini_html_u.c +0 -149
  48. data/ext/commonmarker/html.c +0 -486
  49. data/ext/commonmarker/html.h +0 -27
  50. data/ext/commonmarker/inlines.c +0 -1716
  51. data/ext/commonmarker/inlines.h +0 -29
  52. data/ext/commonmarker/iterator.c +0 -159
  53. data/ext/commonmarker/iterator.h +0 -26
  54. data/ext/commonmarker/latex.c +0 -466
  55. data/ext/commonmarker/linked_list.c +0 -37
  56. data/ext/commonmarker/man.c +0 -278
  57. data/ext/commonmarker/map.c +0 -122
  58. data/ext/commonmarker/map.h +0 -41
  59. data/ext/commonmarker/node.c +0 -979
  60. data/ext/commonmarker/node.h +0 -125
  61. data/ext/commonmarker/parser.h +0 -58
  62. data/ext/commonmarker/plaintext.c +0 -235
  63. data/ext/commonmarker/plugin.c +0 -36
  64. data/ext/commonmarker/plugin.h +0 -34
  65. data/ext/commonmarker/references.c +0 -42
  66. data/ext/commonmarker/references.h +0 -26
  67. data/ext/commonmarker/registry.c +0 -63
  68. data/ext/commonmarker/registry.h +0 -24
  69. data/ext/commonmarker/render.c +0 -205
  70. data/ext/commonmarker/render.h +0 -62
  71. data/ext/commonmarker/scanners.c +0 -10508
  72. data/ext/commonmarker/scanners.h +0 -62
  73. data/ext/commonmarker/scanners.re +0 -341
  74. data/ext/commonmarker/strikethrough.c +0 -167
  75. data/ext/commonmarker/strikethrough.h +0 -9
  76. data/ext/commonmarker/syntax_extension.c +0 -149
  77. data/ext/commonmarker/syntax_extension.h +0 -34
  78. data/ext/commonmarker/table.c +0 -848
  79. data/ext/commonmarker/table.h +0 -12
  80. data/ext/commonmarker/tagfilter.c +0 -60
  81. data/ext/commonmarker/tagfilter.h +0 -8
  82. data/ext/commonmarker/tasklist.c +0 -156
  83. data/ext/commonmarker/tasklist.h +0 -8
  84. data/ext/commonmarker/utf8.c +0 -317
  85. data/ext/commonmarker/utf8.h +0 -35
  86. data/ext/commonmarker/xml.c +0 -181
  87. data/lib/commonmarker/node/inspect.rb +0 -47
  88. data/lib/commonmarker/node.rb +0 -83
  89. data/lib/commonmarker/renderer/html_renderer.rb +0 -252
@@ -1,456 +0,0 @@
1
- #include "autolink.h"
2
- #include <parser.h>
3
- #include <string.h>
4
- #include <utf8.h>
5
-
6
- #if defined(_WIN32)
7
- #define strncasecmp _strnicmp
8
- #else
9
- #include <strings.h>
10
- #endif
11
-
12
- static int is_valid_hostchar(const uint8_t *link, size_t link_len) {
13
- int32_t ch;
14
- int r = cmark_utf8proc_iterate(link, (bufsize_t)link_len, &ch);
15
- if (r < 0)
16
- return 0;
17
- return !cmark_utf8proc_is_space(ch) && !cmark_utf8proc_is_punctuation(ch);
18
- }
19
-
20
/*
 * Check that `link` starts with one of the schemes accepted for autolinking
 * ("http://", "https://", "ftp://", compared case-insensitively) and that
 * the scheme is followed by at least one valid host character.
 *
 * Returns 1 on a match, 0 otherwise.
 */
static int sd_autolink_issafe(const uint8_t *link, size_t link_len) {
  static const char *const schemes[] = {"http://", "https://", "ftp://"};
  const size_t scheme_count = sizeof(schemes) / sizeof(schemes[0]);

  for (size_t k = 0; k < scheme_count; ++k) {
    const char *scheme = schemes[k];
    const size_t scheme_len = strlen(scheme);

    /* The link must be strictly longer than the scheme itself. */
    if (link_len <= scheme_len) {
      continue;
    }

    if (strncasecmp((const char *)link, scheme, scheme_len) != 0) {
      continue;
    }

    if (is_valid_hostchar(link + scheme_len, link_len - scheme_len)) {
      return 1;
    }
  }

  return 0;
}
36
-
37
/*
 * Trim characters from the end of a candidate link that should not be part
 * of it.
 *
 * `data` points at the start of the candidate and `link_end` is its length.
 * The candidate is first cut at the first '<'.  Then, working backwards from
 * the end:
 *   - any of ?!.,:*_~'" is dropped;
 *   - a trailing ';' that closes something shaped like an entity reference
 *     ("&" followed by letters) drops the whole reference, otherwise only
 *     the ';' is dropped;
 *   - a trailing ')' is dropped only while the candidate holds more ')'
 *     than '('.
 *
 * Returns the trimmed length, which may be 0.
 */
static size_t autolink_delim(uint8_t *data, size_t link_end) {
  size_t i;
  size_t opening = 0;
  size_t closing = 0;

  /* Cut at the first '<' and count parentheses in the same pass, so the
   * trimming loop below never has to rescan the prefix (rescanning for
   * every trailing ')' is quadratic on inputs such as ")))))..."). */
  for (i = 0; i < link_end; ++i) {
    const uint8_t c = data[i];

    if (c == '<') {
      link_end = i;
      break;
    } else if (c == '(') {
      opening++;
    } else if (c == ')') {
      closing++;
    }
  }

  while (link_end > 0) {
    const uint8_t last = data[link_end - 1];

    if (strchr("?!.,:*_~'\"", last) != NULL) {
      link_end--;
    } else if (last == ';') {
      size_t new_end;

      /* A lone ';' cannot close an entity reference; computing
       * `link_end - 2` here would wrap around and index far out of bounds. */
      if (link_end < 2) {
        link_end--;
        continue;
      }

      new_end = link_end - 2;

      while (new_end > 0 && cmark_isalpha(data[new_end]))
        new_end--;

      if (new_end < link_end - 2 && data[new_end] == '&')
        link_end = new_end;
      else
        link_end--;
    } else if (last == ')') {
      /* Allow any number of matching brackets at the end of the URL. If
       * there is a greater number of closing brackets than opening ones,
       * we remove one character from the end of the link.
       *
       * Examples (input text => output linked portion):
       *
       *        http://www.pokemon.com/Pikachu_(Electric)
       *                => http://www.pokemon.com/Pikachu_(Electric)
       *
       *        http://www.pokemon.com/Pikachu_((Electric)
       *                => http://www.pokemon.com/Pikachu_((Electric)
       *
       *        http://www.pokemon.com/Pikachu_(Electric))
       *                => http://www.pokemon.com/Pikachu_(Electric)
       *
       *        http://www.pokemon.com/Pikachu_((Electric))
       *                => http://www.pokemon.com/Pikachu_((Electric))
       */
      if (closing <= opening)
        break;

      /* Nothing else trimmed by this loop is a parenthesis, so adjusting
       * the running count keeps it equal to a fresh recount. */
      closing--;
      link_end--;
    } else {
      break;
    }
  }

  return link_end;
}
115
-
116
/*
 * Scan `data` for a run of characters acceptable in a domain name.
 *
 * Scanning starts at index 1 (data[0] is not examined) and stops before the
 * last byte, or earlier at the first byte that is neither '_', '.', '-' nor
 * a valid host character.  Underscores are counted per dot-separated
 * segment; the result is 0 if the segment being scanned when the loop
 * stopped, or the one before it, contains an underscore.
 *
 * With `allow_short` non-zero the index where scanning stopped is returned.
 * Otherwise that index is returned only if at least one '.' was seen, and 0
 * is returned if none was.
 *
 * Returns 0 for an empty buffer.
 */
static size_t check_domain(uint8_t *data, size_t size, int allow_short) {
  size_t i, np = 0, uscore1 = 0, uscore2 = 0;

  /* `size - 1` below is unsigned and would wrap around for size == 0,
   * turning the loop into an out-of-bounds scan. */
  if (size == 0)
    return 0;

  for (i = 1; i < size - 1; i++) {
    if (data[i] == '_')
      uscore2++;
    else if (data[i] == '.') {
      /* New segment: remember the previous segment's underscore count. */
      uscore1 = uscore2;
      uscore2 = 0;
      np++;
    } else if (!is_valid_hostchar(data + i, size - i) && data[i] != '-')
      break;
  }

  if (uscore1 > 0 || uscore2 > 0)
    return 0;

  if (allow_short) {
    /* We don't need a valid domain in the strict sense (with at
     * least one dot); just make sure it's composed of valid
     * domain characters and return the length of the valid
     * sequence. */
    return i;
  } else {
    /* A valid domain needs to have at least a dot.
     * That's as far as we get. */
    return np ? i : 0;
  }
}
145
-
146
- static cmark_node *www_match(cmark_parser *parser, cmark_node *parent,
147
- cmark_inline_parser *inline_parser) {
148
- cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser);
149
- size_t max_rewind = cmark_inline_parser_get_offset(inline_parser);
150
- uint8_t *data = chunk->data + max_rewind;
151
- size_t size = chunk->len - max_rewind;
152
- int start = cmark_inline_parser_get_column(inline_parser);
153
-
154
- size_t link_end;
155
-
156
- if (max_rewind > 0 && strchr("*_~(", data[-1]) == NULL &&
157
- !cmark_isspace(data[-1]))
158
- return 0;
159
-
160
- if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0)
161
- return 0;
162
-
163
- link_end = check_domain(data, size, 0);
164
-
165
- if (link_end == 0)
166
- return NULL;
167
-
168
- while (link_end < size && !cmark_isspace(data[link_end]))
169
- link_end++;
170
-
171
- link_end = autolink_delim(data, link_end);
172
-
173
- if (link_end == 0)
174
- return NULL;
175
-
176
- cmark_inline_parser_set_offset(inline_parser, (int)(max_rewind + link_end));
177
-
178
- cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
179
-
180
- cmark_strbuf buf;
181
- cmark_strbuf_init(parser->mem, &buf, 10);
182
- cmark_strbuf_puts(&buf, "http://");
183
- cmark_strbuf_put(&buf, data, (bufsize_t)link_end);
184
- node->as.link.url = cmark_chunk_buf_detach(&buf);
185
-
186
- cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
187
- text->as.literal =
188
- cmark_chunk_dup(chunk, (bufsize_t)max_rewind, (bufsize_t)link_end);
189
- cmark_node_append_child(node, text);
190
-
191
- node->start_line = text->start_line =
192
- node->end_line = text->end_line =
193
- cmark_inline_parser_get_line(inline_parser);
194
-
195
- node->start_column = text->start_column = start - 1;
196
- node->end_column = text->end_column = cmark_inline_parser_get_column(inline_parser) - 1;
197
-
198
- return node;
199
- }
200
-
201
- static cmark_node *url_match(cmark_parser *parser, cmark_node *parent,
202
- cmark_inline_parser *inline_parser) {
203
- size_t link_end, domain_len;
204
- int rewind = 0;
205
-
206
- cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser);
207
- int max_rewind = cmark_inline_parser_get_offset(inline_parser);
208
- uint8_t *data = chunk->data + max_rewind;
209
- size_t size = chunk->len - max_rewind;
210
-
211
- if (size < 4 || data[1] != '/' || data[2] != '/')
212
- return 0;
213
-
214
- while (rewind < max_rewind && cmark_isalpha(data[-rewind - 1]))
215
- rewind++;
216
-
217
- if (!sd_autolink_issafe(data - rewind, size + rewind))
218
- return 0;
219
-
220
- link_end = strlen("://");
221
-
222
- domain_len = check_domain(data + link_end, size - link_end, 1);
223
-
224
- if (domain_len == 0)
225
- return 0;
226
-
227
- link_end += domain_len;
228
- while (link_end < size && !cmark_isspace(data[link_end]))
229
- link_end++;
230
-
231
- link_end = autolink_delim(data, link_end);
232
-
233
- if (link_end == 0)
234
- return NULL;
235
-
236
- cmark_inline_parser_set_offset(inline_parser, (int)(max_rewind + link_end));
237
- cmark_node_unput(parent, rewind);
238
-
239
- cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
240
-
241
- cmark_chunk url = cmark_chunk_dup(chunk, max_rewind - rewind,
242
- (bufsize_t)(link_end + rewind));
243
- node->as.link.url = url;
244
-
245
- cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
246
- text->as.literal = url;
247
- cmark_node_append_child(node, text);
248
-
249
- return node;
250
- }
251
-
252
- static cmark_node *match(cmark_syntax_extension *ext, cmark_parser *parser,
253
- cmark_node *parent, unsigned char c,
254
- cmark_inline_parser *inline_parser) {
255
- if (cmark_inline_parser_in_bracket(inline_parser, false) ||
256
- cmark_inline_parser_in_bracket(inline_parser, true))
257
- return NULL;
258
-
259
- if (c == ':')
260
- return url_match(parser, parent, inline_parser);
261
-
262
- if (c == 'w')
263
- return www_match(parser, parent, inline_parser);
264
-
265
- return NULL;
266
-
267
- // note that we could end up re-consuming something already a
268
- // part of an inline, because we don't track when the last
269
- // inline was finished in inlines.c.
270
- }
271
-
272
- static bool validate_protocol(char protocol[], uint8_t *data, int rewind) {
273
- size_t len = strlen(protocol);
274
-
275
- // Check that the protocol matches
276
- for (int i = 1; i <= len; i++) {
277
- if (data[-rewind - i] != protocol[len - i]) {
278
- return false;
279
- }
280
- }
281
-
282
- char prev_char = data[-rewind - len - 1];
283
-
284
- // Make sure the character before the protocol is non-alphanumeric
285
- return !cmark_isalnum(prev_char);
286
- }
287
-
288
- static void postprocess_text(cmark_parser *parser, cmark_node *text, int offset, int depth) {
289
- // postprocess_text can recurse very deeply if there is a very long line of
290
- // '@' only. Stop at a reasonable depth to ensure it cannot crash.
291
- if (depth > 1000) return;
292
-
293
- size_t link_end;
294
- uint8_t *data = text->as.literal.data,
295
- *at;
296
- size_t size = text->as.literal.len;
297
- bool auto_mailto = true;
298
- bool is_xmpp = false;
299
- int rewind, max_rewind,
300
- nb = 0, np = 0, ns = 0;
301
-
302
- if (offset < 0 || (size_t)offset >= size)
303
- return;
304
-
305
- data += offset;
306
- size -= offset;
307
-
308
- at = (uint8_t *)memchr(data, '@', size);
309
- if (!at)
310
- return;
311
-
312
- max_rewind = (int)(at - data);
313
- data += max_rewind;
314
- size -= max_rewind;
315
-
316
- for (rewind = 0; rewind < max_rewind; ++rewind) {
317
- uint8_t c = data[-rewind - 1];
318
-
319
- if (cmark_isalnum(c))
320
- continue;
321
-
322
- if (strchr(".+-_", c) != NULL)
323
- continue;
324
-
325
- if (strchr(":", c) != NULL) {
326
- if (validate_protocol("mailto:", data, rewind)) {
327
- auto_mailto = false;
328
- continue;
329
- }
330
-
331
- if (validate_protocol("xmpp:", data, rewind)) {
332
- auto_mailto = false;
333
- is_xmpp = true;
334
- continue;
335
- }
336
- }
337
-
338
- break;
339
- }
340
-
341
- if (rewind == 0 || ns > 0) {
342
- postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
343
- return;
344
- }
345
-
346
- for (link_end = 0; link_end < size; ++link_end) {
347
- uint8_t c = data[link_end];
348
-
349
- if (cmark_isalnum(c))
350
- continue;
351
-
352
- if (c == '@')
353
- nb++;
354
- else if (c == '.' && link_end < size - 1 && cmark_isalnum(data[link_end + 1]))
355
- np++;
356
- else if (c == '/' && is_xmpp)
357
- continue;
358
- else if (c != '-' && c != '_')
359
- break;
360
- }
361
-
362
- if (link_end < 2 || nb != 1 || np == 0 ||
363
- (!cmark_isalpha(data[link_end - 1]) && data[link_end - 1] != '.')) {
364
- postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
365
- return;
366
- }
367
-
368
- link_end = autolink_delim(data, link_end);
369
-
370
- if (link_end == 0) {
371
- postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
372
- return;
373
- }
374
-
375
- cmark_chunk_to_cstr(parser->mem, &text->as.literal);
376
-
377
- cmark_node *link_node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
378
- cmark_strbuf buf;
379
- cmark_strbuf_init(parser->mem, &buf, 10);
380
- if (auto_mailto)
381
- cmark_strbuf_puts(&buf, "mailto:");
382
- cmark_strbuf_put(&buf, data - rewind, (bufsize_t)(link_end + rewind));
383
- link_node->as.link.url = cmark_chunk_buf_detach(&buf);
384
-
385
- cmark_node *link_text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
386
- cmark_chunk email = cmark_chunk_dup(
387
- &text->as.literal,
388
- offset + max_rewind - rewind,
389
- (bufsize_t)(link_end + rewind));
390
- cmark_chunk_to_cstr(parser->mem, &email);
391
- link_text->as.literal = email;
392
- cmark_node_append_child(link_node, link_text);
393
-
394
- cmark_node_insert_after(text, link_node);
395
-
396
- cmark_node *post = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
397
- post->as.literal = cmark_chunk_dup(&text->as.literal,
398
- (bufsize_t)(offset + max_rewind + link_end),
399
- (bufsize_t)(size - link_end));
400
- cmark_chunk_to_cstr(parser->mem, &post->as.literal);
401
-
402
- cmark_node_insert_after(link_node, post);
403
-
404
- text->as.literal.len = offset + max_rewind - rewind;
405
- text->as.literal.data[text->as.literal.len] = 0;
406
-
407
- postprocess_text(parser, post, 0, depth + 1);
408
- }
409
-
410
- static cmark_node *postprocess(cmark_syntax_extension *ext, cmark_parser *parser, cmark_node *root) {
411
- cmark_iter *iter;
412
- cmark_event_type ev;
413
- cmark_node *node;
414
- bool in_link = false;
415
-
416
- cmark_consolidate_text_nodes(root);
417
- iter = cmark_iter_new(root);
418
-
419
- while ((ev = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
420
- node = cmark_iter_get_node(iter);
421
- if (in_link) {
422
- if (ev == CMARK_EVENT_EXIT && node->type == CMARK_NODE_LINK) {
423
- in_link = false;
424
- }
425
- continue;
426
- }
427
-
428
- if (ev == CMARK_EVENT_ENTER && node->type == CMARK_NODE_LINK) {
429
- in_link = true;
430
- continue;
431
- }
432
-
433
- if (ev == CMARK_EVENT_ENTER && node->type == CMARK_NODE_TEXT) {
434
- postprocess_text(parser, node, 0, /*depth*/0);
435
- }
436
- }
437
-
438
- cmark_iter_free(iter);
439
-
440
- return root;
441
- }
442
-
443
- cmark_syntax_extension *create_autolink_extension(void) {
444
- cmark_syntax_extension *ext = cmark_syntax_extension_new("autolink");
445
- cmark_llist *special_chars = NULL;
446
-
447
- cmark_syntax_extension_set_match_inline_func(ext, match);
448
- cmark_syntax_extension_set_postprocess_func(ext, postprocess);
449
-
450
- cmark_mem *mem = cmark_get_default_mem_allocator();
451
- special_chars = cmark_llist_append(mem, special_chars, (void *)':');
452
- special_chars = cmark_llist_append(mem, special_chars, (void *)'w');
453
- cmark_syntax_extension_set_special_inline_chars(ext, special_chars);
454
-
455
- return ext;
456
- }
@@ -1,8 +0,0 @@
1
- #ifndef CMARK_GFM_AUTOLINK_H
2
- #define CMARK_GFM_AUTOLINK_H
3
-
4
- #include "cmark-gfm-core-extensions.h"
5
-
6
- cmark_syntax_extension *create_autolink_extension(void);
7
-
8
- #endif