RubyGems - cataract - Versions diffs - 0.1.3 → 0.2.0 - Mend

cataract 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58)

checksums.yaml +4 -4
data/.github/workflows/ci-manual-rubies.yml +44 -0
data/.overcommit.yml +1 -1
data/.rubocop.yml +96 -4
data/.rubocop_todo.yml +186 -0
data/BENCHMARKS.md +62 -141
data/CHANGELOG.md +20 -0
data/RAGEL_MIGRATION.md +2 -2
data/README.md +37 -4
data/Rakefile +72 -32
data/cataract.gemspec +4 -1
data/ext/cataract/cataract.c +59 -50
data/ext/cataract/cataract.h +5 -3
data/ext/cataract/css_parser.c +173 -65
data/ext/cataract/extconf.rb +2 -2
data/ext/cataract/{merge.c → flatten.c} +526 -468
data/ext/cataract/shorthand_expander.c +164 -115
data/lib/cataract/at_rule.rb +8 -9
data/lib/cataract/declaration.rb +18 -0
data/lib/cataract/import_resolver.rb +63 -43
data/lib/cataract/import_statement.rb +49 -0
data/lib/cataract/pure/byte_constants.rb +69 -0
data/lib/cataract/pure/flatten.rb +1145 -0
data/lib/cataract/pure/helpers.rb +35 -0
data/lib/cataract/pure/imports.rb +268 -0
data/lib/cataract/pure/parser.rb +1340 -0
data/lib/cataract/pure/serializer.rb +590 -0
data/lib/cataract/pure/specificity.rb +206 -0
data/lib/cataract/pure.rb +153 -0
data/lib/cataract/rule.rb +69 -15
data/lib/cataract/stylesheet.rb +356 -49
data/lib/cataract/version.rb +1 -1
data/lib/cataract.rb +43 -26
metadata +14 -26
data/benchmarks/benchmark_harness.rb +0 -193
data/benchmarks/benchmark_merging.rb +0 -121
data/benchmarks/benchmark_optimization_comparison.rb +0 -168
data/benchmarks/benchmark_parsing.rb +0 -153
data/benchmarks/benchmark_ragel_removal.rb +0 -56
data/benchmarks/benchmark_runner.rb +0 -70
data/benchmarks/benchmark_serialization.rb +0 -180
data/benchmarks/benchmark_shorthand.rb +0 -109
data/benchmarks/benchmark_shorthand_expansion.rb +0 -176
data/benchmarks/benchmark_specificity.rb +0 -124
data/benchmarks/benchmark_string_allocation.rb +0 -151
data/benchmarks/benchmark_stylesheet_to_s.rb +0 -62
data/benchmarks/benchmark_to_s_cached.rb +0 -55
data/benchmarks/benchmark_value_splitter.rb +0 -54
data/benchmarks/benchmark_yjit.rb +0 -158
data/benchmarks/benchmark_yjit_workers.rb +0 -61
data/benchmarks/profile_to_s.rb +0 -23
data/benchmarks/speedup_calculator.rb +0 -83
data/benchmarks/system_metadata.rb +0 -81
data/benchmarks/templates/benchmarks.md.erb +0 -221
data/benchmarks/yjit_tests.rb +0 -141
data/scripts/fuzzer/run.rb +0 -828
data/scripts/fuzzer/worker.rb +0 -99
data/scripts/generate_benchmarks_md.rb +0 -155

data/ext/cataract/css_parser.c CHANGED

@@ -17,6 +17,7 @@
 typedef struct {
     VALUE rules_array;        // Array of Rule structs
     VALUE media_index;        // Hash: Symbol => Array of rule IDs
+    VALUE imports_array;      // Array of ImportStatement structs
     int rule_id_counter;      // Next rule ID (0-indexed)
     int media_query_count;    // Safety limit for media queries
     st_table *media_cache;    // Parse-time cache: string => parsed media types
@@ -361,10 +362,9 @@ static void update_media_index(ParserContext *ctx, VALUE media_sym, int rule_id)
         return;  // No media query - rule applies to all media
     }
-    // Add to full query symbol
-    add_to_media_index(ctx->media_index, media_sym, rule_id);
-    // Extract media types and add to each (if different from full query)
+    // Extract media types and add to each first (if different from full query)
+    // We add these BEFORE the full query so that when iterating the media_index hash,
+    // the full query comes last and takes precedence during serialization
     VALUE media_str = rb_sym2str(media_sym);
     const char *query = RSTRING_PTR(media_str);
     long query_len = RSTRING_LEN(media_str);
@@ -380,6 +380,9 @@ static void update_media_index(ParserContext *ctx, VALUE media_sym, int rule_id)
         }
     }
+    // Add to full query symbol (after media types for insertion order)
+    add_to_media_index(ctx->media_index, media_sym, rule_id);
     // Guard media_str since we extracted C pointer and called extract_media_types (which allocates)
     RB_GC_GUARD(media_str);
 }
@@ -412,8 +415,14 @@ static VALUE parse_declarations(const char *start, const char *end) {
         // Example: "color: red; ..."
         //           ^pos  ^pos (at :)
         const char *prop_start = pos;
-        while (pos < end && *pos != ':') pos++;
-        if (pos >= end) break;  // No colon found
+        while (pos < end && *pos != ':' && *pos != ';') pos++;
+        // Malformed declaration - skip to next semicolon to recover
+        if (pos >= end || *pos != ':') {
+            while (pos < end && *pos != ';') pos++;
+            if (pos < end) pos++;  // Skip the semicolon
+            continue;
+        }
         const char *prop_end = pos;
         // Trim whitespace from property
@@ -564,7 +573,7 @@ static VALUE combine_media_queries(VALUE parent, VALUE child) {
 /*
  * Intern media query string to symbol with safety check
- * Strips outer parentheses from standalone conditions like "(orientation: landscape)"
+ * Keeps media query exactly as written - parentheses are required per CSS spec
  */
 static VALUE intern_media_query_safe(ParserContext *ctx, const char *query_str, long query_len) {
     if (query_len == 0) {
@@ -578,38 +587,14 @@ static VALUE intern_media_query_safe(ParserContext *ctx, const char *query_str,
                 MAX_MEDIA_QUERIES);
     }
-    // Strip outer parentheses from standalone conditions
-    // Example: "(orientation: landscape)" => "orientation: landscape"
-    // But keep: "screen and (min-width: 500px)" as-is
+    // Keep media query exactly as written - parentheses are required per CSS spec
     const char *start = query_str;
     const char *end = query_str + query_len;
-    // Trim whitespace
+    // Trim whitespace only
     while (start < end && IS_WHITESPACE(*start)) start++;
     while (end > start && IS_WHITESPACE(*(end - 1))) end--;
-    if (end > start && *start == '(' && *(end - 1) == ')') {
-        // Check if this is a simple wrapped condition (no other parens/operators)
-        int depth = 0;
-        int has_and_or = 0;
-        for (const char *p = start; p < end; p++) {
-            if (*p == '(') depth++;
-            else if (*p == ')') depth--;
-            // Check for "and" or "or" at depth 0 (outside our outer parens)
-            if (depth == 0 && p + 3 < end &&
-                (strncmp(p, " and ", 5) == 0 || strncmp(p, " or ", 4) == 0)) {
-                has_and_or = 1;
-                break;
-            }
-        }
-        // Strip outer parens if depth stays >= 1 (no operators outside) and no and/or
-        if (!has_and_or && depth == 0) {
-            start++;  // Skip opening (
-            end--;    // Skip closing )
-        }
-    }
     long final_len = end - start;
     VALUE query_string = rb_usascii_str_new(start, final_len);
     VALUE sym = ID2SYM(rb_intern_str(query_string));
@@ -884,6 +869,119 @@ static VALUE parse_mixed_block(ParserContext *ctx, const char *start, const char
     return declarations;
 }
+/*
+ * Parse one @import statement; called with *p_ptr just past the @import keyword
+ * @import "url" [media-query];
+ * @import url("url") [media-query];
+ * Malformed input (no quoted URL, or an unterminated string) records no import.
+ * On success appends to ctx->imports_array and increments ctx->rule_id_counter; every path updates *p_ptr
+ */
+static void parse_import_statement(ParserContext *ctx, const char **p_ptr, const char *pe) {
+    const char *p = *p_ptr;
+    DEBUG_PRINTF("[IMPORT_STMT] Starting parse, input: %.50s\n", p);
+    // Skip leading whitespace before the URL
+    while (p < pe && IS_WHITESPACE(*p)) p++;
+    // Optional url( wrapper around the quoted URL
+    int has_url_function = 0;
+    if (p + 4 <= pe && strncmp(p, "url(", 4) == 0) {
+        has_url_function = 1;
+        p += 4;
+        // Skip whitespace after url(
+        while (p < pe && IS_WHITESPACE(*p)) p++;
+    }
+    // Require an opening quote (single or double); an unquoted URL is treated as invalid
+    if (p >= pe || (*p != '"' && *p != '\'')) {
+        // Invalid @import: skip through the next semicolon and record nothing
+        while (p < pe && *p != ';') p++;
+        if (p < pe) p++;
+        *p_ptr = p;
+        return;
+    }
+    char quote_char = *p;
+    p++; // Skip opening quote
+    const char *url_start = p;
+    // Find the matching closing quote; a backslash skips the following byte, and escapes stay verbatim in the URL
+    while (p < pe && *p != quote_char) {
+        if (*p == '\\' && p + 1 < pe) {
+            p += 2; // Skip escaped character
+        } else {
+            p++;
+        }
+    }
+    if (p >= pe) {
+        // Unterminated string: input is exhausted, so report the end position and record nothing
+        *p_ptr = p;
+        return;
+    }
+    long url_len = p - url_start;
+    VALUE url = rb_utf8_str_new(url_start, url_len);
+    p++; // Skip closing quote
+    // Skip closing paren if we had url( (a missing paren is tolerated)
+    if (has_url_function) {
+        while (p < pe && IS_WHITESPACE(*p)) p++;
+        if (p < pe && *p == ')') p++;
+    }
+    // Skip whitespace between the URL and any media query
+    while (p < pe && IS_WHITESPACE(*p)) p++;
+    // Optional media query: everything up to the semicolon, interned as a Symbol (nil when absent)
+    VALUE media = Qnil;
+    if (p < pe && *p != ';') {
+        const char *media_start = p;
+        // Find semicolon (or end of input)
+        while (p < pe && *p != ';') p++;
+        const char *media_end = p;
+        // Trim trailing whitespace from media query
+        while (media_end > media_start && IS_WHITESPACE(*(media_end - 1))) {
+            media_end--;
+        }
+        if (media_end > media_start) {
+            VALUE media_str = rb_utf8_str_new(media_start, media_end - media_start);
+            media = ID2SYM(rb_intern_str(media_str));
+        }
+    }
+    // Skip semicolon if present
+    if (p < pe && *p == ';') p++;
+    // Create ImportStatement (resolved: false by default); its id is the current rule_id_counter value
+    VALUE import_stmt = rb_struct_new(cImportStatement,
+        INT2FIX(ctx->rule_id_counter),
+        url,
+        media,
+        Qfalse);
+    DEBUG_PRINTF("[IMPORT_STMT] Created import: id=%d, url=%s, media=%s\n",
+                 ctx->rule_id_counter,
+                 RSTRING_PTR(url),
+                 NIL_P(media) ? "nil" : RSTRING_PTR(rb_sym2str(media)));
+    rb_ary_push(ctx->imports_array, import_stmt);
+    ctx->rule_id_counter++;
+    *p_ptr = p;
+    RB_GC_GUARD(url);
+    RB_GC_GUARD(media);
+    RB_GC_GUARD(import_stmt);
+}
 /*
  * Parse CSS recursively with media query context and optional parent selector for nesting
  *
@@ -914,6 +1012,30 @@ static void parse_css_recursive(ParserContext *ctx, const char *css, const char
         // Skip comments (rare in typical CSS)
         SKIP_COMMENT(p, pe);
+        // Hail mary ...
+        // DEBUG_PRINTF("[LOOP] At position, char='%c' (0x%02x), brace_depth=%d, next 20 chars: %.20s\n",
+        //            *p >= 32 && *p <= 126 ? *p : '?', (unsigned char)*p, brace_depth, p);
+        // Check for @import at-rule (only at top level, before any rules)
+        if (RB_UNLIKELY(brace_depth == 0 && p + 7 < pe && *p == '@' &&
+            strncmp(p + 1, "import", 6) == 0 && IS_WHITESPACE(p[7]))) {
+            DEBUG_PRINTF("[IMPORT] Found @import at position, rules_count=%ld\n", RARRAY_LEN(ctx->rules_array));
+            // Check if we've already seen a rule
+            if (RARRAY_LEN(ctx->rules_array) > 0) {
+                // Warn and skip - @import must come before rules
+                rb_warn("CSS @import ignored: @import must appear before all rules (found import after rules)");
+                // Skip to semicolon
+                while (p < pe && *p != ';') p++;
+                if (p < pe) p++;
+                continue;
+            }
+            p += 7;  // Skip "@import "
+            parse_import_statement(ctx, &p, pe);
+            DEBUG_PRINTF("[IMPORT] After parsing, imports_count=%ld\n", RARRAY_LEN(ctx->imports_array));
+            continue;
+        }
         // Check for @media at-rule (only at depth 0)
         if (RB_UNLIKELY(brace_depth == 0 && p + 6 < pe && *p == '@' &&
             strncmp(p + 1, "media", 5) == 0 && IS_WHITESPACE(p[6]))) {
@@ -1322,6 +1444,7 @@ static void parse_css_recursive(ParserContext *ctx, const char *css, const char
         // Start of selector
         if (brace_depth == 0 && selector_start == NULL) {
             selector_start = p;
+            DEBUG_PRINTF("[SELECTOR] Starting selector at: %.50s\n", selector_start);
         }
         p++;
@@ -1353,6 +1476,9 @@ VALUE parse_media_types(VALUE self, VALUE media_query_sym) {
 VALUE parse_css_new_impl(VALUE css_string, int rule_id_offset) {
     Check_Type(css_string, T_STRING);
+    DEBUG_PRINTF("\n[PARSE_NEW] ========== NEW PARSE CALL ==========\n");
+    DEBUG_PRINTF("[PARSE_NEW] Input CSS (first 100 chars): %.100s\n", RSTRING_PTR(css_string));
     const char *css = RSTRING_PTR(css_string);
     const char *pe = css + RSTRING_LEN(css_string);
     const char *p = css;
@@ -1361,59 +1487,33 @@ VALUE parse_css_new_impl(VALUE css_string, int rule_id_offset) {
     // Extract @charset
     if (RSTRING_LEN(css_string) > 10 && strncmp(css, "@charset ", 9) == 0) {
+        DEBUG_PRINTF("[CHARSET] Found @charset at start\n");
         char *quote_start = strchr(css + 9, '"');
         if (quote_start != NULL) {
             char *quote_end = strchr(quote_start + 1, '"');
             if (quote_end != NULL) {
                 charset = rb_str_new(quote_start + 1, quote_end - quote_start - 1);
+                DEBUG_PRINTF("[CHARSET] Extracted charset: %s\n", RSTRING_PTR(charset));
                 char *semicolon = quote_end + 1;
                 while (semicolon < pe && IS_WHITESPACE(*semicolon)) {
                     semicolon++;
                 }
                 if (semicolon < pe && *semicolon == ';') {
                     p = semicolon + 1;
+                    DEBUG_PRINTF("[CHARSET] Advanced past semicolon, remaining: %.50s\n", p);
                 }
             }
         }
     }
-    // Skip @import statements - they should be handled by ImportResolver at Ruby level
-    // Per CSS spec, @import must come before all rules (except @charset)
-    while (p < pe) {
-        // Skip whitespace
-        while (p < pe && IS_WHITESPACE(*p)) p++;
-        if (p >= pe) break;
-        // Skip comments
-        if (p + 1 < pe && p[0] == '/' && p[1] == '*') {
-            p += 2;
-            while (p + 1 < pe) {
-                if (p[0] == '*' && p[1] == '/') {
-                    p += 2;
-                    break;
-                }
-                p++;
-            }
-            continue;
-        }
-        // Check for @import
-        if (p + 7 <= pe && *p == '@' && strncasecmp(p + 1, "import", 6) == 0 &&
-            (p + 7 >= pe || IS_WHITESPACE(p[7]) || p[7] == '\'' || p[7] == '"')) {
-            // Skip to semicolon
-            while (p < pe && *p != ';') p++;
-            if (p < pe) p++; // Skip semicolon
-            continue;
-        }
-        // Hit non-@import content, stop skipping
-        break;
-    }
+    // @import statements are now handled in parse_css_recursive
+    // They must come before all rules (except @charset) per CSS spec
     // Initialize parser context with offset
     ParserContext ctx;
     ctx.rules_array = rb_ary_new();
     ctx.media_index = rb_hash_new();
+    ctx.imports_array = rb_ary_new();
     ctx.rule_id_counter = rule_id_offset;  // Start from offset
     ctx.media_query_count = 0;
     ctx.media_cache = NULL;  // Removed - no perf benefit
@@ -1421,15 +1521,23 @@ VALUE parse_css_new_impl(VALUE css_string, int rule_id_offset) {
     ctx.depth = 0;  // Start at depth 0
     // Parse CSS (top-level, no parent context)
+    DEBUG_PRINTF("[PARSE] Starting parse_css_recursive from: %.80s\n", p);
     parse_css_recursive(&ctx, p, pe, NO_PARENT_MEDIA, NO_PARENT_SELECTOR, NO_PARENT_RULE_ID);
     // Build result hash
     VALUE result = rb_hash_new();
     rb_hash_aset(result, ID2SYM(rb_intern("rules")), ctx.rules_array);
     rb_hash_aset(result, ID2SYM(rb_intern("_media_index")), ctx.media_index);
+    rb_hash_aset(result, ID2SYM(rb_intern("imports")), ctx.imports_array);
     rb_hash_aset(result, ID2SYM(rb_intern("charset")), charset);
     rb_hash_aset(result, ID2SYM(rb_intern("last_rule_id")), INT2FIX(ctx.rule_id_counter));
     rb_hash_aset(result, ID2SYM(rb_intern("_has_nesting")), ctx.has_nesting ? Qtrue : Qfalse);
+    RB_GC_GUARD(charset);
+    RB_GC_GUARD(ctx.rules_array);
+    RB_GC_GUARD(ctx.media_index);
+    RB_GC_GUARD(ctx.imports_array);
+    RB_GC_GUARD(result);
     return result;
 }

data/ext/cataract/extconf.rb CHANGED

@@ -21,8 +21,8 @@ def config_str_buf_optimization?
   enable_config('str-buf-optimization', true)
 end
-# Compile main file, parser, merge, and supporting files
-$objs = ['cataract.o', 'css_parser.o', 'merge.o', 'shorthand_expander.o', 'specificity.o', 'value_splitter.o',
+# Compile main file, parser, flatten, and supporting files
+$objs = ['cataract.o', 'css_parser.o', 'flatten.o', 'shorthand_expander.o', 'specificity.o', 'value_splitter.o',
          'import_scanner.o']
 # Suppress warnings