RubyGems - commonmarker - Versions diffs - 0.23.6 → 0.23.7 - Mend

commonmarker 0.23.6 → 0.23.7

Potentially problematic release.

This version of commonmarker might be problematic; see the original release page for more details.

Files changed (22):

checksums.yaml +4 -4
data/ext/commonmarker/arena.c +8 -7
data/ext/commonmarker/autolink.c +202 -155
data/ext/commonmarker/blocks.c +14 -0
data/ext/commonmarker/cmark-gfm-extension_api.h +1 -0
data/ext/commonmarker/commonmarker.c +1 -0
data/ext/commonmarker/inlines.c +130 -58
data/ext/commonmarker/map.c +11 -4
data/ext/commonmarker/map.h +5 -2
data/ext/commonmarker/node.c +34 -0
data/ext/commonmarker/node.h +26 -6
data/ext/commonmarker/parser.h +1 -0
data/ext/commonmarker/references.c +1 -0
data/ext/commonmarker/scanners.c +13916 -10368
data/ext/commonmarker/scanners.h +8 -0
data/ext/commonmarker/strikethrough.c +1 -1
data/ext/commonmarker/table.c +59 -35
data/lib/commonmarker/config.rb +15 -13
data/lib/commonmarker/renderer.rb +1 -1
data/lib/commonmarker/version.rb +1 -1
data/lib/commonmarker.rb +27 -25
metadata +7 -7

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: cf9a7803972a4a9111e93837c8e47265ba83f10b3abd5cff73ec7375d862ef28
-  data.tar.gz: a292683b676b06e8cb4a190e5bb60c5bc3e474c5d3b631701f7fab1f176d9a5f
+  metadata.gz: 9bc9c02a66017c3e576e64845a18440060df1f7e0082c192b51e3c7ebfd1413e
+  data.tar.gz: b7623cfa01c89817fef3f4ba850fb992d27876e7c4d2e3d9e73f085c78c3d3da
 SHA512:
-  metadata.gz: '0884ee35781e71e96cbe7b87c4d8e885b13882e8b67fc594fe53a615410a00ed4599d7a99eab9347ca8f272fd93be31b06bea890a7b0c41a1a1407a717e4f3d2'
-  data.tar.gz: ee9024ba7ee8b0143185ab41466b00b51349b55b70e12443ee11d68e640591ddab0e8073481e4cc19818fa56f82e1efbd29eaf0690203b4192b9602525169d06
+  metadata.gz: 3f11b3d05d2f7b9c43499c5b4aa868b82263cbc95119106697653300e65fd0a6018ad71584adec7d8b95e9c6a27da39f13c6fbf0cdbfd8185519783f1f8b1bbd
+  data.tar.gz: c9b318e34f244e6cadfecf746dfc61be16dc13965c3cd7add4c0d0ed587a3aff0f9da79e2fb0a1a8aa311ccddd47532924cbf1cfd95e371b33c1d5e2ad9ee563

data/ext/commonmarker/arena.c CHANGED Viewed

@@ -68,15 +68,16 @@ static void *arena_calloc(size_t nmem, size_t size) {
   const size_t align = sizeof(size_t) - 1;
   sz = (sz + align) & ~align;
+  struct arena_chunk *chunk;
   if (sz > A->sz) {
-    A->prev = alloc_arena_chunk(sz, A->prev);
-    return (uint8_t *) A->prev->ptr + sizeof(size_t);
+    A->prev = chunk = alloc_arena_chunk(sz, A->prev);
+  } else if (sz > A->sz - A->used) {
+    A = chunk = alloc_arena_chunk(A->sz + A->sz / 2, A);
+  } else {
+    chunk = A;
   }
-  if (sz > A->sz - A->used) {
-    A = alloc_arena_chunk(A->sz + A->sz / 2, A);
-  }
-  void *ptr = (uint8_t *) A->ptr + A->used;
-  A->used += sz;
+  void *ptr = (uint8_t *) chunk->ptr + chunk->used;
+  chunk->used += sz;
   *((size_t *) ptr) = sz - sizeof(size_t);
   return (uint8_t *) ptr + sizeof(size_t);
 }

data/ext/commonmarker/autolink.c CHANGED Viewed

@@ -2,6 +2,7 @@
 #include <parser.h>
 #include <string.h>
 #include <utf8.h>
+#include <stddef.h>
 #if defined(_WIN32)
 #define strncasecmp _strnicmp
@@ -35,44 +36,25 @@ static int sd_autolink_issafe(const uint8_t *link, size_t link_len) {
 }
 static size_t autolink_delim(uint8_t *data, size_t link_end) {
-  uint8_t cclose, copen;
   size_t i;
+  size_t closing = 0;
+  size_t opening = 0;
-  for (i = 0; i < link_end; ++i)
-    if (data[i] == '<') {
+  for (i = 0; i < link_end; ++i) {
+    const uint8_t c = data[i];
+    if (c == '<') {
       link_end = i;
       break;
+    } else if (c == '(') {
+      opening++;
+    } else if (c == ')') {
+      closing++;
     }
+  }
   while (link_end > 0) {
-    cclose = data[link_end - 1];
-    switch (cclose) {
+    switch (data[link_end - 1]) {
     case ')':
-      copen = '(';
-      break;
-    default:
-      copen = 0;
-    }
-    if (strchr("?!.,:*_~'\"", data[link_end - 1]) != NULL)
-      link_end--;
-    else if (data[link_end - 1] == ';') {
-      size_t new_end = link_end - 2;
-      while (new_end > 0 && cmark_isalpha(data[new_end]))
-        new_end--;
-      if (new_end < link_end - 2 && data[new_end] == '&')
-        link_end = new_end;
-      else
-        link_end--;
-    } else if (copen != 0) {
-      size_t closing = 0;
-      size_t opening = 0;
-      i = 0;
       /* Allow any number of matching brackets (as recognised in copen/cclose)
        * at the end of the URL.  If there is a greater number of closing
        * brackets than opening ones, we remove one character from the end of
@@ -80,34 +62,52 @@ static size_t autolink_delim(uint8_t *data, size_t link_end) {
        *
        * Examples (input text => output linked portion):
        *
-       *	http://www.pokemon.com/Pikachu_(Electric)
-       *		=> http://www.pokemon.com/Pikachu_(Electric)
+       *        http://www.pokemon.com/Pikachu_(Electric)
+       *                => http://www.pokemon.com/Pikachu_(Electric)
        *
-       *	http://www.pokemon.com/Pikachu_((Electric)
-       *		=> http://www.pokemon.com/Pikachu_((Electric)
+       *        http://www.pokemon.com/Pikachu_((Electric)
+       *                => http://www.pokemon.com/Pikachu_((Electric)
        *
-       *	http://www.pokemon.com/Pikachu_(Electric))
-       *		=> http://www.pokemon.com/Pikachu_(Electric)
+       *        http://www.pokemon.com/Pikachu_(Electric))
+       *                => http://www.pokemon.com/Pikachu_(Electric)
        *
-       *	http://www.pokemon.com/Pikachu_((Electric))
-       *		=> http://www.pokemon.com/Pikachu_((Electric))
+       *        http://www.pokemon.com/Pikachu_((Electric))
+       *                => http://www.pokemon.com/Pikachu_((Electric))
        */
-      while (i < link_end) {
-        if (data[i] == copen)
-          opening++;
-        else if (data[i] == cclose)
-          closing++;
-        i++;
+      if (closing <= opening) {
+        return link_end;
       }
+      closing--;
+      link_end--;
+      break;
+    case '?':
+    case '!':
+    case '.':
+    case ',':
+    case ':':
+    case '*':
+    case '_':
+    case '~':
+    case '\'':
+    case '"':
+      link_end--;
+      break;
+    case ';': {
+      size_t new_end = link_end - 2;
-      if (closing <= opening)
-        break;
+      while (new_end > 0 && cmark_isalpha(data[new_end]))
+        new_end--;
-      link_end--;
-    } else
+      if (new_end < link_end - 2 && data[new_end] == '&')
+        link_end = new_end;
+      else
+        link_end--;
       break;
+    }
+    default:
+      return link_end;
+    }
   }
   return link_end;
@@ -116,7 +116,20 @@ static size_t autolink_delim(uint8_t *data, size_t link_end) {
 static size_t check_domain(uint8_t *data, size_t size, int allow_short) {
   size_t i, np = 0, uscore1 = 0, uscore2 = 0;
+  /* The purpose of this code is to reject urls that contain an underscore
+   * in one of the last two segments. Examples:
+   *
+   *   www.xxx.yyy.zzz     autolinked
+   *   www.xxx.yyy._zzz    not autolinked
+   *   www.xxx._yyy.zzz    not autolinked
+   *   www._xxx.yyy.zzz    autolinked
+   *
+   * The reason is that domain names are allowed to include underscores,
+   * but host names are not. See: https://stackoverflow.com/a/2183140
+   */
   for (i = 1; i < size - 1; i++) {
+    if (data[i] == '\\' && i < size - 2)
+      i++;
     if (data[i] == '_')
       uscore2++;
     else if (data[i] == '.') {
@@ -127,8 +140,17 @@ static size_t check_domain(uint8_t *data, size_t size, int allow_short) {
       break;
   }
-  if (uscore1 > 0 || uscore2 > 0)
-    return 0;
+  if (uscore1 > 0 || uscore2 > 0) {
+    /* If the url is very long then accept it despite the underscores,
+     * to avoid quadratic behavior causing a denial of service. See:
+     * https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c
+     * Reasonable urls are unlikely to have more than 10 segments, so
+     * this extra condition shouldn't have any impact on normal usage.
+     */
+    if (np <= 10) {
+      return 0;
+    }
+  }
   if (allow_short) {
     /* We don't need a valid domain in the strict sense (with
@@ -165,7 +187,7 @@ static cmark_node *www_match(cmark_parser *parser, cmark_node *parent,
   if (link_end == 0)
     return NULL;
-  while (link_end < size && !cmark_isspace(data[link_end]))
+  while (link_end < size && !cmark_isspace(data[link_end]) && data[link_end] != '<')
     link_end++;
   link_end = autolink_delim(data, link_end);
@@ -225,7 +247,7 @@ static cmark_node *url_match(cmark_parser *parser, cmark_node *parent,
     return 0;
   link_end += domain_len;
-  while (link_end < size && !cmark_isspace(data[link_end]))
+  while (link_end < size && !cmark_isspace(data[link_end]) && data[link_end] != '<')
     link_end++;
   link_end = autolink_delim(data, link_end);
@@ -269,142 +291,167 @@ static cmark_node *match(cmark_syntax_extension *ext, cmark_parser *parser,
   // inline was finished in inlines.c.
 }
-static bool validate_protocol(char protocol[], uint8_t *data, int rewind) {
+static bool validate_protocol(char protocol[], uint8_t *data, size_t rewind, size_t max_rewind) {
   size_t len = strlen(protocol);
+  if (len > (max_rewind - rewind)) {
+    return false;
+  }
   // Check that the protocol matches
-  for (int i = 1; i <= len; i++) {
-    if (data[-rewind - i] != protocol[len - i]) {
-      return false;
-    }
+  if (memcmp(data - rewind - len, protocol, len) != 0) {
+    return false;
+  }
+  if (len == (max_rewind - rewind)) {
+    return true;
   }
-  char prev_char = data[-rewind - len - 1];
+  char prev_char = data[-((ptrdiff_t)rewind) - len - 1];
   // Make sure the character before the protocol is non-alphanumeric
   return !cmark_isalnum(prev_char);
 }
-static void postprocess_text(cmark_parser *parser, cmark_node *text, int offset, int depth) {
-  // postprocess_text can recurse very deeply if there is a very long line of
-  // '@' only.  Stop at a reasonable depth to ensure it cannot crash.
-  if (depth > 1000) return;
+static void postprocess_text(cmark_parser *parser, cmark_node *text) {
+  size_t start = 0;
+  size_t offset = 0;
+  // `text` is going to be split into a list of nodes containing shorter segments
+  // of text, so we detach the memory buffer from text and use `cmark_chunk_dup` to
+  // create references to it. Later, `cmark_chunk_to_cstr` is used to convert
+  // the references into allocated buffers. The detached buffer is freed before we
+  // return.
+  cmark_chunk detached_chunk = text->as.literal;
+  text->as.literal = cmark_chunk_dup(&detached_chunk, 0, detached_chunk.len);
+  uint8_t *data = text->as.literal.data;
+  size_t remaining = text->as.literal.len;
+  while (true) {
+    size_t link_end;
+    uint8_t *at;
+    bool auto_mailto = true;
+    bool is_xmpp = false;
+    size_t rewind;
+    size_t max_rewind;
+    size_t np = 0;
+    if (offset >= remaining)
+      break;
-  size_t link_end;
-  uint8_t *data = text->as.literal.data,
-    *at;
-  size_t size = text->as.literal.len;
-  bool auto_mailto = true;
-  bool is_xmpp = false;
-  int rewind, max_rewind,
-      nb = 0, np = 0, ns = 0;
+    at = (uint8_t *)memchr(data + start + offset, '@', remaining - offset);
+    if (!at)
+      break;
-  if (offset < 0 || (size_t)offset >= size)
-    return;
+    max_rewind = at - (data + start + offset);
-  data += offset;
-  size -= offset;
+found_at:
+    for (rewind = 0; rewind < max_rewind; ++rewind) {
+      uint8_t c = data[start + offset + max_rewind - rewind - 1];
-  at = (uint8_t *)memchr(data, '@', size);
-  if (!at)
-    return;
+      if (cmark_isalnum(c))
+        continue;
-  max_rewind = (int)(at - data);
-  data += max_rewind;
-  size -= max_rewind;
+      if (strchr(".+-_", c) != NULL)
+        continue;
-  for (rewind = 0; rewind < max_rewind; ++rewind) {
-    uint8_t c = data[-rewind - 1];
+      if (strchr(":", c) != NULL) {
+        if (validate_protocol("mailto:", data + start + offset + max_rewind, rewind, max_rewind)) {
+          auto_mailto = false;
+          continue;
+        }
+        if (validate_protocol("xmpp:", data + start + offset + max_rewind, rewind, max_rewind)) {
+          auto_mailto = false;
+          is_xmpp = true;
+          continue;
+        }
+      }
-    if (cmark_isalnum(c))
-      continue;
+      break;
+    }
-    if (strchr(".+-_", c) != NULL)
+    if (rewind == 0) {
+      offset += max_rewind + 1;
       continue;
+    }
-    if (strchr(":", c) != NULL) {
-      if (validate_protocol("mailto:", data, rewind)) {
-        auto_mailto = false;
+    assert(data[start + offset + max_rewind] == '@');
+    for (link_end = 1; link_end < remaining - offset - max_rewind; ++link_end) {
+      uint8_t c = data[start + offset + max_rewind + link_end];
+      if (cmark_isalnum(c))
         continue;
-      }
-      if (validate_protocol("xmpp:", data, rewind)) {
-        auto_mailto = false;
-        is_xmpp = true;
+      if (c == '@') {
+        // Found another '@', so go back and try again with an updated offset and max_rewind.
+        offset += max_rewind + 1;
+        max_rewind = link_end - 1;
+        goto found_at;
+      } else if (c == '.' && link_end < remaining - offset - max_rewind - 1 &&
+               cmark_isalnum(data[start + offset + max_rewind + link_end + 1]))
+        np++;
+      else if (c == '/' && is_xmpp)
         continue;
-      }
+      else if (c != '-' && c != '_')
+        break;
     }
-    break;
-  }
-  if (rewind == 0 || ns > 0) {
-    postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
-    return;
-  }
-  for (link_end = 0; link_end < size; ++link_end) {
-    uint8_t c = data[link_end];
-    if (cmark_isalnum(c))
-      continue;
-    if (c == '@')
-      nb++;
-    else if (c == '.' && link_end < size - 1 && cmark_isalnum(data[link_end + 1]))
-      np++;
-    else if (c == '/' && is_xmpp)
+    if (link_end < 2 || np == 0 ||
+        (!cmark_isalpha(data[start + offset + max_rewind + link_end - 1]) &&
+         data[start + offset + max_rewind + link_end - 1] != '.')) {
+      offset += max_rewind + link_end;
       continue;
-    else if (c != '-' && c != '_')
-      break;
-  }
+    }
-  if (link_end < 2 || nb != 1 || np == 0 ||
-      (!cmark_isalpha(data[link_end - 1]) && data[link_end - 1] != '.')) {
-    postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
-    return;
-  }
+    link_end = autolink_delim(data + start + offset + max_rewind, link_end);
-  link_end = autolink_delim(data, link_end);
+    if (link_end == 0) {
+      offset += max_rewind + 1;
+      continue;
+    }
-  if (link_end == 0) {
-    postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
-    return;
-  }
+    cmark_node *link_node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
+    cmark_strbuf buf;
+    cmark_strbuf_init(parser->mem, &buf, 10);
+    if (auto_mailto)
+      cmark_strbuf_puts(&buf, "mailto:");
+    cmark_strbuf_put(&buf, data + start + offset + max_rewind - rewind, (bufsize_t)(link_end + rewind));
+    link_node->as.link.url = cmark_chunk_buf_detach(&buf);
+    cmark_node *link_text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
+    cmark_chunk email = cmark_chunk_dup(
+      &detached_chunk,
+      (bufsize_t)(start + offset + max_rewind - rewind),
+      (bufsize_t)(link_end + rewind));
+    cmark_chunk_to_cstr(parser->mem, &email);
+    link_text->as.literal = email;
+    cmark_node_append_child(link_node, link_text);
-  cmark_chunk_to_cstr(parser->mem, &text->as.literal);
+    cmark_node_insert_after(text, link_node);
-  cmark_node *link_node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
-  cmark_strbuf buf;
-  cmark_strbuf_init(parser->mem, &buf, 10);
-  if (auto_mailto)
-    cmark_strbuf_puts(&buf, "mailto:");
-  cmark_strbuf_put(&buf, data - rewind, (bufsize_t)(link_end + rewind));
-  link_node->as.link.url = cmark_chunk_buf_detach(&buf);
-  cmark_node *link_text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
-  cmark_chunk email = cmark_chunk_dup(
-      &text->as.literal,
-      offset + max_rewind - rewind,
-      (bufsize_t)(link_end + rewind));
-  cmark_chunk_to_cstr(parser->mem, &email);
-  link_text->as.literal = email;
-  cmark_node_append_child(link_node, link_text);
+    cmark_node *post = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
+    post->as.literal = cmark_chunk_dup(&detached_chunk,
+                                       (bufsize_t)(start + offset + max_rewind + link_end),
+                                       (bufsize_t)(remaining - offset - max_rewind - link_end));
-  cmark_node_insert_after(text, link_node);
+    cmark_node_insert_after(link_node, post);
-  cmark_node *post = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
-  post->as.literal = cmark_chunk_dup(&text->as.literal,
-    (bufsize_t)(offset + max_rewind + link_end),
-    (bufsize_t)(size - link_end));
-  cmark_chunk_to_cstr(parser->mem, &post->as.literal);
+    text->as.literal = cmark_chunk_dup(&detached_chunk, (bufsize_t)start, (bufsize_t)(offset + max_rewind - rewind));
+    cmark_chunk_to_cstr(parser->mem, &text->as.literal);
-  cmark_node_insert_after(link_node, post);
+    text = post;
+    start += offset + max_rewind + link_end;
+    remaining -= offset + max_rewind + link_end;
+    offset = 0;
+  }
-  text->as.literal.len = offset + max_rewind - rewind;
-  text->as.literal.data[text->as.literal.len] = 0;
+  // Convert the reference to allocated memory.
+  assert(!text->as.literal.alloc);
+  cmark_chunk_to_cstr(parser->mem, &text->as.literal);
-  postprocess_text(parser, post, 0, depth + 1);
+  // Free the detached buffer.
+  cmark_chunk_free(parser->mem, &detached_chunk);
 }
 static cmark_node *postprocess(cmark_syntax_extension *ext, cmark_parser *parser, cmark_node *root) {
@@ -431,7 +478,7 @@ static cmark_node *postprocess(cmark_syntax_extension *ext, cmark_parser *parser
     }
     if (ev == CMARK_EVENT_ENTER && node->type == CMARK_NODE_TEXT) {
-      postprocess_text(parser, node, 0, /*depth*/0);
+      postprocess_text(parser, node);
     }
   }

data/ext/commonmarker/blocks.c CHANGED Viewed

@@ -8,6 +8,7 @@
 #include <stdlib.h>
 #include <assert.h>
 #include <stdio.h>
+#include <limits.h>
 #include "cmark_ctype.h"
 #include "syntax_extension.h"
@@ -639,6 +640,14 @@ static cmark_node *finalize_document(cmark_parser *parser) {
   }
   finalize(parser, parser->root);
+  // Limit total size of extra content created from reference links to
+  // document size to avoid superlinear growth. Always allow 100KB.
+  if (parser->total_size > 100000)
+    parser->refmap->max_ref_size = parser->total_size;
+  else
+    parser->refmap->max_ref_size = 100000;
   process_inlines(parser, parser->refmap, parser->options);
   if (parser->options & CMARK_OPT_FOOTNOTES)
     process_footnotes(parser);
@@ -698,6 +707,11 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
   const unsigned char *end = buffer + len;
   static const uint8_t repl[] = {239, 191, 189};
+  if (len > UINT_MAX - parser->total_size)
+    parser->total_size = UINT_MAX;
+  else
+    parser->total_size += len;
   if (parser->last_buffer_ended_with_cr && *buffer == '\n') {
     // skip NL if last buffer ended with CR ; see #117
     buffer++;

data/ext/commonmarker/cmark-gfm-extension_api.h CHANGED Viewed

@@ -114,6 +114,7 @@ typedef struct delimiter {
   struct delimiter *previous;
   struct delimiter *next;
   cmark_node *inl_text;
+  bufsize_t position;
   bufsize_t length;
   unsigned char delim_char;
   int can_open;

data/ext/commonmarker/commonmarker.c CHANGED Viewed

@@ -1304,4 +1304,5 @@ __attribute__((visibility("default"))) void Init_commonmarker() {
   rb_define_method(rb_cNode, "html_escape_html", rb_html_escape_html, 1);
   cmark_gfm_core_extensions_ensure_registered();
+  cmark_init_standard_node_flags();
 }