cataract 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ci-manual-rubies.yml +44 -0
  3. data/.overcommit.yml +1 -1
  4. data/.rubocop.yml +96 -4
  5. data/.rubocop_todo.yml +186 -0
  6. data/BENCHMARKS.md +62 -141
  7. data/CHANGELOG.md +20 -0
  8. data/RAGEL_MIGRATION.md +2 -2
  9. data/README.md +37 -4
  10. data/Rakefile +72 -32
  11. data/cataract.gemspec +4 -1
  12. data/ext/cataract/cataract.c +59 -50
  13. data/ext/cataract/cataract.h +5 -3
  14. data/ext/cataract/css_parser.c +173 -65
  15. data/ext/cataract/extconf.rb +2 -2
  16. data/ext/cataract/{merge.c → flatten.c} +526 -468
  17. data/ext/cataract/shorthand_expander.c +164 -115
  18. data/lib/cataract/at_rule.rb +8 -9
  19. data/lib/cataract/declaration.rb +18 -0
  20. data/lib/cataract/import_resolver.rb +63 -43
  21. data/lib/cataract/import_statement.rb +49 -0
  22. data/lib/cataract/pure/byte_constants.rb +69 -0
  23. data/lib/cataract/pure/flatten.rb +1145 -0
  24. data/lib/cataract/pure/helpers.rb +35 -0
  25. data/lib/cataract/pure/imports.rb +268 -0
  26. data/lib/cataract/pure/parser.rb +1340 -0
  27. data/lib/cataract/pure/serializer.rb +590 -0
  28. data/lib/cataract/pure/specificity.rb +206 -0
  29. data/lib/cataract/pure.rb +153 -0
  30. data/lib/cataract/rule.rb +69 -15
  31. data/lib/cataract/stylesheet.rb +356 -49
  32. data/lib/cataract/version.rb +1 -1
  33. data/lib/cataract.rb +43 -26
  34. metadata +14 -26
  35. data/benchmarks/benchmark_harness.rb +0 -193
  36. data/benchmarks/benchmark_merging.rb +0 -121
  37. data/benchmarks/benchmark_optimization_comparison.rb +0 -168
  38. data/benchmarks/benchmark_parsing.rb +0 -153
  39. data/benchmarks/benchmark_ragel_removal.rb +0 -56
  40. data/benchmarks/benchmark_runner.rb +0 -70
  41. data/benchmarks/benchmark_serialization.rb +0 -180
  42. data/benchmarks/benchmark_shorthand.rb +0 -109
  43. data/benchmarks/benchmark_shorthand_expansion.rb +0 -176
  44. data/benchmarks/benchmark_specificity.rb +0 -124
  45. data/benchmarks/benchmark_string_allocation.rb +0 -151
  46. data/benchmarks/benchmark_stylesheet_to_s.rb +0 -62
  47. data/benchmarks/benchmark_to_s_cached.rb +0 -55
  48. data/benchmarks/benchmark_value_splitter.rb +0 -54
  49. data/benchmarks/benchmark_yjit.rb +0 -158
  50. data/benchmarks/benchmark_yjit_workers.rb +0 -61
  51. data/benchmarks/profile_to_s.rb +0 -23
  52. data/benchmarks/speedup_calculator.rb +0 -83
  53. data/benchmarks/system_metadata.rb +0 -81
  54. data/benchmarks/templates/benchmarks.md.erb +0 -221
  55. data/benchmarks/yjit_tests.rb +0 -141
  56. data/scripts/fuzzer/run.rb +0 -828
  57. data/scripts/fuzzer/worker.rb +0 -99
  58. data/scripts/generate_benchmarks_md.rb +0 -155
@@ -17,6 +17,7 @@
17
17
  typedef struct {
18
18
  VALUE rules_array; // Array of Rule structs
19
19
  VALUE media_index; // Hash: Symbol => Array of rule IDs
20
+ VALUE imports_array; // Array of ImportStatement structs
20
21
  int rule_id_counter; // Next rule ID (0-indexed)
21
22
  int media_query_count; // Safety limit for media queries
22
23
  st_table *media_cache; // Parse-time cache: string => parsed media types
@@ -361,10 +362,9 @@ static void update_media_index(ParserContext *ctx, VALUE media_sym, int rule_id)
361
362
  return; // No media query - rule applies to all media
362
363
  }
363
364
 
364
- // Add to full query symbol
365
- add_to_media_index(ctx->media_index, media_sym, rule_id);
366
-
367
- // Extract media types and add to each (if different from full query)
365
+ // Extract media types and add to each first (if different from full query)
366
+ // We add these BEFORE the full query so that when iterating the media_index hash,
367
+ // the full query comes last and takes precedence during serialization
368
368
  VALUE media_str = rb_sym2str(media_sym);
369
369
  const char *query = RSTRING_PTR(media_str);
370
370
  long query_len = RSTRING_LEN(media_str);
@@ -380,6 +380,9 @@ static void update_media_index(ParserContext *ctx, VALUE media_sym, int rule_id)
380
380
  }
381
381
  }
382
382
 
383
+ // Add to full query symbol (after media types for insertion order)
384
+ add_to_media_index(ctx->media_index, media_sym, rule_id);
385
+
383
386
  // Guard media_str since we extracted C pointer and called extract_media_types (which allocates)
384
387
  RB_GC_GUARD(media_str);
385
388
  }
@@ -412,8 +415,14 @@ static VALUE parse_declarations(const char *start, const char *end) {
412
415
  // Example: "color: red; ..."
413
416
  // ^pos ^pos (at :)
414
417
  const char *prop_start = pos;
415
- while (pos < end && *pos != ':') pos++;
416
- if (pos >= end) break; // No colon found
418
+ while (pos < end && *pos != ':' && *pos != ';') pos++;
419
+
420
+ // Malformed declaration - skip to next semicolon to recover
421
+ if (pos >= end || *pos != ':') {
422
+ while (pos < end && *pos != ';') pos++;
423
+ if (pos < end) pos++; // Skip the semicolon
424
+ continue;
425
+ }
417
426
 
418
427
  const char *prop_end = pos;
419
428
  // Trim whitespace from property
@@ -564,7 +573,7 @@ static VALUE combine_media_queries(VALUE parent, VALUE child) {
564
573
 
565
574
  /*
566
575
  * Intern media query string to symbol with safety check
567
- * Strips outer parentheses from standalone conditions like "(orientation: landscape)"
576
+ * Keeps media query exactly as written - parentheses are required per CSS spec
568
577
  */
569
578
  static VALUE intern_media_query_safe(ParserContext *ctx, const char *query_str, long query_len) {
570
579
  if (query_len == 0) {
@@ -578,38 +587,14 @@ static VALUE intern_media_query_safe(ParserContext *ctx, const char *query_str,
578
587
  MAX_MEDIA_QUERIES);
579
588
  }
580
589
 
581
- // Strip outer parentheses from standalone conditions
582
- // Example: "(orientation: landscape)" => "orientation: landscape"
583
- // But keep: "screen and (min-width: 500px)" as-is
590
+ // Keep media query exactly as written - parentheses are required per CSS spec
584
591
  const char *start = query_str;
585
592
  const char *end = query_str + query_len;
586
593
 
587
- // Trim whitespace
594
+ // Trim whitespace only
588
595
  while (start < end && IS_WHITESPACE(*start)) start++;
589
596
  while (end > start && IS_WHITESPACE(*(end - 1))) end--;
590
597
 
591
- if (end > start && *start == '(' && *(end - 1) == ')') {
592
- // Check if this is a simple wrapped condition (no other parens/operators)
593
- int depth = 0;
594
- int has_and_or = 0;
595
- for (const char *p = start; p < end; p++) {
596
- if (*p == '(') depth++;
597
- else if (*p == ')') depth--;
598
- // Check for "and" or "or" at depth 0 (outside our outer parens)
599
- if (depth == 0 && p + 3 < end &&
600
- (strncmp(p, " and ", 5) == 0 || strncmp(p, " or ", 4) == 0)) {
601
- has_and_or = 1;
602
- break;
603
- }
604
- }
605
-
606
- // Strip outer parens if depth stays >= 1 (no operators outside) and no and/or
607
- if (!has_and_or && depth == 0) {
608
- start++; // Skip opening (
609
- end--; // Skip closing )
610
- }
611
- }
612
-
613
598
  long final_len = end - start;
614
599
  VALUE query_string = rb_usascii_str_new(start, final_len);
615
600
  VALUE sym = ID2SYM(rb_intern_str(query_string));
@@ -884,6 +869,119 @@ static VALUE parse_mixed_block(ParserContext *ctx, const char *start, const char
884
869
  return declarations;
885
870
  }
886
871
 
872
+ /*
873
+ * Parse @import statement
874
+ * @import "url" [media-query];
875
+ * @import url("url") [media-query];
876
+ *
877
+ * Modifies ctx->imports_array and ctx->rule_id_counter
878
+ */
879
+ static void parse_import_statement(ParserContext *ctx, const char **p_ptr, const char *pe) {
880
+ const char *p = *p_ptr;
881
+
882
+ DEBUG_PRINTF("[IMPORT_STMT] Starting parse, input: %.50s\n", p);
883
+
884
+ // Skip whitespace
885
+ while (p < pe && IS_WHITESPACE(*p)) p++;
886
+
887
+ // Check for optional url(
888
+ int has_url_function = 0;
889
+ if (p + 4 <= pe && strncmp(p, "url(", 4) == 0) {
890
+ has_url_function = 1;
891
+ p += 4;
892
+
893
+ // Skip whitespace after url(
894
+ while (p < pe && IS_WHITESPACE(*p)) p++;
895
+ }
896
+
897
+ // Find opening quote
898
+ if (p >= pe || (*p != '"' && *p != '\'')) {
899
+ // Invalid @import, skip to semicolon
900
+ while (p < pe && *p != ';') p++;
901
+ if (p < pe) p++;
902
+ *p_ptr = p;
903
+ return;
904
+ }
905
+
906
+ char quote_char = *p;
907
+ p++; // Skip opening quote
908
+
909
+ const char *url_start = p;
910
+
911
+ // Find closing quote (handle escaped quotes)
912
+ while (p < pe && *p != quote_char) {
913
+ if (*p == '\\' && p + 1 < pe) {
914
+ p += 2; // Skip escaped character
915
+ } else {
916
+ p++;
917
+ }
918
+ }
919
+
920
+ if (p >= pe) {
921
+ // Unterminated string
922
+ *p_ptr = p;
923
+ return;
924
+ }
925
+
926
+ long url_len = p - url_start;
927
+ VALUE url = rb_utf8_str_new(url_start, url_len);
928
+ p++; // Skip closing quote
929
+
930
+ // Skip closing paren if we had url(
931
+ if (has_url_function) {
932
+ while (p < pe && IS_WHITESPACE(*p)) p++;
933
+ if (p < pe && *p == ')') p++;
934
+ }
935
+
936
+ // Skip whitespace
937
+ while (p < pe && IS_WHITESPACE(*p)) p++;
938
+
939
+ // Check for optional media query (everything until semicolon)
940
+ VALUE media = Qnil;
941
+ if (p < pe && *p != ';') {
942
+ const char *media_start = p;
943
+
944
+ // Find semicolon
945
+ while (p < pe && *p != ';') p++;
946
+
947
+ const char *media_end = p;
948
+
949
+ // Trim trailing whitespace from media query
950
+ while (media_end > media_start && IS_WHITESPACE(*(media_end - 1))) {
951
+ media_end--;
952
+ }
953
+
954
+ if (media_end > media_start) {
955
+ VALUE media_str = rb_utf8_str_new(media_start, media_end - media_start);
956
+ media = ID2SYM(rb_intern_str(media_str));
957
+ }
958
+ }
959
+
960
+ // Skip semicolon
961
+ if (p < pe && *p == ';') p++;
962
+
963
+ // Create ImportStatement (resolved: false by default)
964
+ VALUE import_stmt = rb_struct_new(cImportStatement,
965
+ INT2FIX(ctx->rule_id_counter),
966
+ url,
967
+ media,
968
+ Qfalse);
969
+
970
+ DEBUG_PRINTF("[IMPORT_STMT] Created import: id=%d, url=%s, media=%s\n",
971
+ ctx->rule_id_counter,
972
+ RSTRING_PTR(url),
973
+ NIL_P(media) ? "nil" : RSTRING_PTR(rb_sym2str(media)));
974
+
975
+ rb_ary_push(ctx->imports_array, import_stmt);
976
+ ctx->rule_id_counter++;
977
+
978
+ *p_ptr = p;
979
+
980
+ RB_GC_GUARD(url);
981
+ RB_GC_GUARD(media);
982
+ RB_GC_GUARD(import_stmt);
983
+ }
984
+
887
985
  /*
888
986
  * Parse CSS recursively with media query context and optional parent selector for nesting
889
987
  *
@@ -914,6 +1012,30 @@ static void parse_css_recursive(ParserContext *ctx, const char *css, const char
914
1012
  // Skip comments (rare in typical CSS)
915
1013
  SKIP_COMMENT(p, pe);
916
1014
 
1015
+ // Hail mary ...
1016
+ // DEBUG_PRINTF("[LOOP] At position, char='%c' (0x%02x), brace_depth=%d, next 20 chars: %.20s\n",
1017
+ // *p >= 32 && *p <= 126 ? *p : '?', (unsigned char)*p, brace_depth, p);
1018
+
1019
+ // Check for @import at-rule (only at top level, before any rules)
1020
+ if (RB_UNLIKELY(brace_depth == 0 && p + 7 < pe && *p == '@' &&
1021
+ strncmp(p + 1, "import", 6) == 0 && IS_WHITESPACE(p[7]))) {
1022
+ DEBUG_PRINTF("[IMPORT] Found @import at position, rules_count=%ld\n", RARRAY_LEN(ctx->rules_array));
1023
+ // Check if we've already seen a rule
1024
+ if (RARRAY_LEN(ctx->rules_array) > 0) {
1025
+ // Warn and skip - @import must come before rules
1026
+ rb_warn("CSS @import ignored: @import must appear before all rules (found import after rules)");
1027
+ // Skip to semicolon
1028
+ while (p < pe && *p != ';') p++;
1029
+ if (p < pe) p++;
1030
+ continue;
1031
+ }
1032
+
1033
+ p += 7; // Skip "@import "
1034
+ parse_import_statement(ctx, &p, pe);
1035
+ DEBUG_PRINTF("[IMPORT] After parsing, imports_count=%ld\n", RARRAY_LEN(ctx->imports_array));
1036
+ continue;
1037
+ }
1038
+
917
1039
  // Check for @media at-rule (only at depth 0)
918
1040
  if (RB_UNLIKELY(brace_depth == 0 && p + 6 < pe && *p == '@' &&
919
1041
  strncmp(p + 1, "media", 5) == 0 && IS_WHITESPACE(p[6]))) {
@@ -1322,6 +1444,7 @@ static void parse_css_recursive(ParserContext *ctx, const char *css, const char
1322
1444
  // Start of selector
1323
1445
  if (brace_depth == 0 && selector_start == NULL) {
1324
1446
  selector_start = p;
1447
+ DEBUG_PRINTF("[SELECTOR] Starting selector at: %.50s\n", selector_start);
1325
1448
  }
1326
1449
 
1327
1450
  p++;
@@ -1353,6 +1476,9 @@ VALUE parse_media_types(VALUE self, VALUE media_query_sym) {
1353
1476
  VALUE parse_css_new_impl(VALUE css_string, int rule_id_offset) {
1354
1477
  Check_Type(css_string, T_STRING);
1355
1478
 
1479
+ DEBUG_PRINTF("\n[PARSE_NEW] ========== NEW PARSE CALL ==========\n");
1480
+ DEBUG_PRINTF("[PARSE_NEW] Input CSS (first 100 chars): %.100s\n", RSTRING_PTR(css_string));
1481
+
1356
1482
  const char *css = RSTRING_PTR(css_string);
1357
1483
  const char *pe = css + RSTRING_LEN(css_string);
1358
1484
  const char *p = css;
@@ -1361,59 +1487,33 @@ VALUE parse_css_new_impl(VALUE css_string, int rule_id_offset) {
1361
1487
 
1362
1488
  // Extract @charset
1363
1489
  if (RSTRING_LEN(css_string) > 10 && strncmp(css, "@charset ", 9) == 0) {
1490
+ DEBUG_PRINTF("[CHARSET] Found @charset at start\n");
1364
1491
  char *quote_start = strchr(css + 9, '"');
1365
1492
  if (quote_start != NULL) {
1366
1493
  char *quote_end = strchr(quote_start + 1, '"');
1367
1494
  if (quote_end != NULL) {
1368
1495
  charset = rb_str_new(quote_start + 1, quote_end - quote_start - 1);
1496
+ DEBUG_PRINTF("[CHARSET] Extracted charset: %s\n", RSTRING_PTR(charset));
1369
1497
  char *semicolon = quote_end + 1;
1370
1498
  while (semicolon < pe && IS_WHITESPACE(*semicolon)) {
1371
1499
  semicolon++;
1372
1500
  }
1373
1501
  if (semicolon < pe && *semicolon == ';') {
1374
1502
  p = semicolon + 1;
1503
+ DEBUG_PRINTF("[CHARSET] Advanced past semicolon, remaining: %.50s\n", p);
1375
1504
  }
1376
1505
  }
1377
1506
  }
1378
1507
  }
1379
1508
 
1380
- // Skip @import statements - they should be handled by ImportResolver at Ruby level
1381
- // Per CSS spec, @import must come before all rules (except @charset)
1382
- while (p < pe) {
1383
- // Skip whitespace
1384
- while (p < pe && IS_WHITESPACE(*p)) p++;
1385
- if (p >= pe) break;
1386
-
1387
- // Skip comments
1388
- if (p + 1 < pe && p[0] == '/' && p[1] == '*') {
1389
- p += 2;
1390
- while (p + 1 < pe) {
1391
- if (p[0] == '*' && p[1] == '/') {
1392
- p += 2;
1393
- break;
1394
- }
1395
- p++;
1396
- }
1397
- continue;
1398
- }
1399
-
1400
- // Check for @import
1401
- if (p + 7 <= pe && *p == '@' && strncasecmp(p + 1, "import", 6) == 0 &&
1402
- (p + 7 >= pe || IS_WHITESPACE(p[7]) || p[7] == '\'' || p[7] == '"')) {
1403
- // Skip to semicolon
1404
- while (p < pe && *p != ';') p++;
1405
- if (p < pe) p++; // Skip semicolon
1406
- continue;
1407
- }
1408
-
1409
- // Hit non-@import content, stop skipping
1410
- break;
1411
- }
1509
+ // @import statements are now handled in parse_css_recursive
1510
+ // They must come before all rules (except @charset) per CSS spec
1412
1511
 
1413
1512
  // Initialize parser context with offset
1414
1513
  ParserContext ctx;
1415
1514
  ctx.rules_array = rb_ary_new();
1416
1515
  ctx.media_index = rb_hash_new();
1516
+ ctx.imports_array = rb_ary_new();
1417
1517
  ctx.rule_id_counter = rule_id_offset; // Start from offset
1418
1518
  ctx.media_query_count = 0;
1419
1519
  ctx.media_cache = NULL; // Removed - no perf benefit
@@ -1421,15 +1521,23 @@ VALUE parse_css_new_impl(VALUE css_string, int rule_id_offset) {
1421
1521
  ctx.depth = 0; // Start at depth 0
1422
1522
 
1423
1523
  // Parse CSS (top-level, no parent context)
1524
+ DEBUG_PRINTF("[PARSE] Starting parse_css_recursive from: %.80s\n", p);
1424
1525
  parse_css_recursive(&ctx, p, pe, NO_PARENT_MEDIA, NO_PARENT_SELECTOR, NO_PARENT_RULE_ID);
1425
1526
 
1426
1527
  // Build result hash
1427
1528
  VALUE result = rb_hash_new();
1428
1529
  rb_hash_aset(result, ID2SYM(rb_intern("rules")), ctx.rules_array);
1429
1530
  rb_hash_aset(result, ID2SYM(rb_intern("_media_index")), ctx.media_index);
1531
+ rb_hash_aset(result, ID2SYM(rb_intern("imports")), ctx.imports_array);
1430
1532
  rb_hash_aset(result, ID2SYM(rb_intern("charset")), charset);
1431
1533
  rb_hash_aset(result, ID2SYM(rb_intern("last_rule_id")), INT2FIX(ctx.rule_id_counter));
1432
1534
  rb_hash_aset(result, ID2SYM(rb_intern("_has_nesting")), ctx.has_nesting ? Qtrue : Qfalse);
1433
1535
 
1536
+ RB_GC_GUARD(charset);
1537
+ RB_GC_GUARD(ctx.rules_array);
1538
+ RB_GC_GUARD(ctx.media_index);
1539
+ RB_GC_GUARD(ctx.imports_array);
1540
+ RB_GC_GUARD(result);
1541
+
1434
1542
  return result;
1435
1543
  }
@@ -21,8 +21,8 @@ def config_str_buf_optimization?
21
21
  enable_config('str-buf-optimization', true)
22
22
  end
23
23
 
24
- # Compile main file, parser, merge, and supporting files
25
- $objs = ['cataract.o', 'css_parser.o', 'merge.o', 'shorthand_expander.o', 'specificity.o', 'value_splitter.o',
24
+ # Compile main file, parser, flatten, and supporting files
25
+ $objs = ['cataract.o', 'css_parser.o', 'flatten.o', 'shorthand_expander.o', 'specificity.o', 'value_splitter.o',
26
26
  'import_scanner.o']
27
27
 
28
28
  # Suppress warnings