cataract 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. checksums.yaml +7 -0
  2. data/.clang-tidy +30 -0
  3. data/.github/workflows/ci-macos.yml +12 -0
  4. data/.github/workflows/ci.yml +77 -0
  5. data/.github/workflows/test.yml +76 -0
  6. data/.gitignore +45 -0
  7. data/.overcommit.yml +38 -0
  8. data/.rubocop.yml +83 -0
  9. data/BENCHMARKS.md +201 -0
  10. data/CHANGELOG.md +1 -0
  11. data/Gemfile +27 -0
  12. data/LICENSE +21 -0
  13. data/RAGEL_MIGRATION.md +60 -0
  14. data/README.md +292 -0
  15. data/Rakefile +209 -0
  16. data/benchmarks/benchmark_harness.rb +193 -0
  17. data/benchmarks/benchmark_merging.rb +121 -0
  18. data/benchmarks/benchmark_optimization_comparison.rb +168 -0
  19. data/benchmarks/benchmark_parsing.rb +153 -0
  20. data/benchmarks/benchmark_ragel_removal.rb +56 -0
  21. data/benchmarks/benchmark_runner.rb +70 -0
  22. data/benchmarks/benchmark_serialization.rb +180 -0
  23. data/benchmarks/benchmark_shorthand.rb +109 -0
  24. data/benchmarks/benchmark_shorthand_expansion.rb +176 -0
  25. data/benchmarks/benchmark_specificity.rb +124 -0
  26. data/benchmarks/benchmark_string_allocation.rb +151 -0
  27. data/benchmarks/benchmark_stylesheet_to_s.rb +62 -0
  28. data/benchmarks/benchmark_to_s_cached.rb +55 -0
  29. data/benchmarks/benchmark_value_splitter.rb +54 -0
  30. data/benchmarks/benchmark_yjit.rb +158 -0
  31. data/benchmarks/benchmark_yjit_workers.rb +61 -0
  32. data/benchmarks/profile_to_s.rb +23 -0
  33. data/benchmarks/speedup_calculator.rb +83 -0
  34. data/benchmarks/system_metadata.rb +81 -0
  35. data/benchmarks/templates/benchmarks.md.erb +221 -0
  36. data/benchmarks/yjit_tests.rb +141 -0
  37. data/cataract.gemspec +34 -0
  38. data/cliff.toml +92 -0
  39. data/examples/color_conversion_visual_test/color_conversion_test.html +3603 -0
  40. data/examples/color_conversion_visual_test/generate.rb +202 -0
  41. data/examples/color_conversion_visual_test/template.html.erb +259 -0
  42. data/examples/css_analyzer/analyzer.rb +164 -0
  43. data/examples/css_analyzer/analyzers/base.rb +33 -0
  44. data/examples/css_analyzer/analyzers/colors.rb +133 -0
  45. data/examples/css_analyzer/analyzers/important.rb +88 -0
  46. data/examples/css_analyzer/analyzers/properties.rb +61 -0
  47. data/examples/css_analyzer/analyzers/specificity.rb +68 -0
  48. data/examples/css_analyzer/templates/report.html.erb +575 -0
  49. data/examples/css_analyzer.rb +69 -0
  50. data/examples/github_analysis.html +5343 -0
  51. data/ext/cataract/cataract.c +1086 -0
  52. data/ext/cataract/cataract.h +174 -0
  53. data/ext/cataract/css_parser.c +1435 -0
  54. data/ext/cataract/extconf.rb +48 -0
  55. data/ext/cataract/import_scanner.c +174 -0
  56. data/ext/cataract/merge.c +973 -0
  57. data/ext/cataract/shorthand_expander.c +902 -0
  58. data/ext/cataract/specificity.c +213 -0
  59. data/ext/cataract/value_splitter.c +116 -0
  60. data/ext/cataract_color/cataract_color.c +16 -0
  61. data/ext/cataract_color/color_conversion.c +1687 -0
  62. data/ext/cataract_color/color_conversion.h +136 -0
  63. data/ext/cataract_color/color_conversion_lab.c +571 -0
  64. data/ext/cataract_color/color_conversion_named.c +259 -0
  65. data/ext/cataract_color/color_conversion_oklab.c +547 -0
  66. data/ext/cataract_color/extconf.rb +23 -0
  67. data/ext/cataract_old/cataract.c +393 -0
  68. data/ext/cataract_old/cataract.h +250 -0
  69. data/ext/cataract_old/css_parser.c +933 -0
  70. data/ext/cataract_old/extconf.rb +67 -0
  71. data/ext/cataract_old/import_scanner.c +174 -0
  72. data/ext/cataract_old/merge.c +776 -0
  73. data/ext/cataract_old/shorthand_expander.c +902 -0
  74. data/ext/cataract_old/specificity.c +213 -0
  75. data/ext/cataract_old/stylesheet.c +290 -0
  76. data/ext/cataract_old/value_splitter.c +116 -0
  77. data/lib/cataract/at_rule.rb +97 -0
  78. data/lib/cataract/color_conversion.rb +18 -0
  79. data/lib/cataract/declarations.rb +332 -0
  80. data/lib/cataract/import_resolver.rb +210 -0
  81. data/lib/cataract/rule.rb +131 -0
  82. data/lib/cataract/stylesheet.rb +716 -0
  83. data/lib/cataract/stylesheet_scope.rb +257 -0
  84. data/lib/cataract/version.rb +5 -0
  85. data/lib/cataract.rb +107 -0
  86. data/lib/tasks/gem.rake +158 -0
  87. data/scripts/fuzzer/run.rb +828 -0
  88. data/scripts/fuzzer/worker.rb +99 -0
  89. data/scripts/generate_benchmarks_md.rb +155 -0
  90. metadata +135 -0
@@ -0,0 +1,933 @@
1
+ /*
2
+ * css_parser.c - CSS parser implementation
3
+ *
4
+ * Handles: selectors, declaration blocks, @media, @supports, @keyframes, @font-face, etc.
5
+ *
6
+ * This is a character-by-character state machine parser.
7
+ */
8
+
9
+ #include "cataract.h"
10
+ #include <string.h>
11
+
12
// States of the character-by-character CSS parser.
typedef enum {
    // Start of parsing, or just after a closing }
    STATE_INITIAL = 0,
    // Currently reading a selector
    STATE_SELECTOR,
    // Inside { } reading declarations
    STATE_DECLARATIONS
} ParserState;
18
+
19
+ // Forward declarations
20
+ VALUE parse_css_impl(VALUE css_string, int depth, VALUE parent_media_query);
21
+ VALUE parse_media_query(const char *query_str, long query_len);
22
+ VALUE parse_declarations_string(const char *start, const char *end);
23
+ static char* copy_without_comments(const char *start, const char *end, long *out_len);
24
+
25
+ // Context for merging hash callbacks
26
+ struct merge_hash_ctx {
27
+ VALUE target_hash;
28
+ };
29
+
30
+ // Callback for merging inner_hash into target hash
31
+ // Both inner and target have structure: {query_string => {media_types: [...], rules: [...]}}
32
+ static int merge_hash_callback(VALUE key, VALUE inner_group, VALUE arg) {
33
+ struct merge_hash_ctx *ctx = (struct merge_hash_ctx *)arg;
34
+ VALUE our_group = rb_hash_aref(ctx->target_hash, key);
35
+
36
+ if (NIL_P(our_group)) {
37
+ // No existing group for this query string - just add it
38
+ rb_hash_aset(ctx->target_hash, key, inner_group);
39
+ } else {
40
+ // Merge the rules arrays from both groups
41
+ VALUE our_rules = rb_hash_aref(our_group, ID2SYM(rb_intern("rules")));
42
+ VALUE inner_rules = rb_hash_aref(inner_group, ID2SYM(rb_intern("rules")));
43
+
44
+ long inner_len = RARRAY_LEN(inner_rules);
45
+ for (long i = 0; i < inner_len; i++) {
46
+ rb_ary_push(our_rules, RARRAY_AREF(inner_rules, i));
47
+ }
48
+ }
49
+
50
+ return ST_CONTINUE;
51
+ }
52
+
53
+ // ============================================================================
54
+ // CSS Parsing Helper Functions
55
+ // ============================================================================
56
+
57
+ // Parse declaration block and extract Declaration structs
58
+ void capture_declarations_fn(
59
+ const char **decl_start_ptr,
60
+ const char *p,
61
+ VALUE *current_declarations,
62
+ const char *css_string_base
63
+ ) {
64
+ // Guard against multiple firings - only process if decl_start is set
65
+ if (*decl_start_ptr == NULL) {
66
+ DEBUG_PRINTF("[capture_declarations] SKIPPED: decl_start is NULL\n");
67
+ return;
68
+ }
69
+
70
+ const char *decl_start = *decl_start_ptr;
71
+
72
+ // Initialize declarations array if needed
73
+ if (NIL_P(*current_declarations)) {
74
+ *current_declarations = rb_ary_new();
75
+ }
76
+
77
+ const char *start = decl_start;
78
+ const char *end = p;
79
+
80
+ DEBUG_PRINTF("[capture_declarations] Parsing declarations from %td to %td: '%.*s'\n",
81
+ (ptrdiff_t)(decl_start - css_string_base), (ptrdiff_t)(p - css_string_base),
82
+ (int)(end - start), start);
83
+
84
+ // Fast path: check if there are any comments in the declaration block
85
+ int has_comments = 0;
86
+ for (const char *check = start; check + 1 < end; check++) {
87
+ if (*check == '/' && *(check + 1) == '*') {
88
+ has_comments = 1;
89
+ break;
90
+ }
91
+ }
92
+
93
+ // If there are comments, strip them first (rare case)
94
+ char *clean_buffer = NULL;
95
+ const char *clean_end = end;
96
+ if (has_comments) {
97
+ long clean_len;
98
+ clean_buffer = copy_without_comments(start, end, &clean_len);
99
+ start = clean_buffer;
100
+ clean_end = clean_buffer + clean_len;
101
+ }
102
+
103
+ // Simple C-level parser for declarations
104
+ // Input: "color: red; background: blue !important"
105
+ // Output: Array of Declaration structs
106
+ const char *pos = start;
107
+ while (pos < clean_end) {
108
+ // Skip whitespace and semicolons
109
+ while (pos < clean_end && (IS_WHITESPACE(*pos) || *pos == ';')) {
110
+ pos++;
111
+ }
112
+ if (pos >= clean_end) break;
113
+
114
+ // Find property (up to colon)
115
+ const char *prop_start = pos;
116
+ while (pos < clean_end && *pos != ':') pos++;
117
+ if (pos >= clean_end) break; // No colon found
118
+
119
+ const char *prop_end = pos;
120
+ // Trim whitespace from property
121
+ trim_trailing(prop_start, &prop_end);
122
+ trim_leading(&prop_start, prop_end);
123
+
124
+ pos++; // Skip colon
125
+
126
+ // Skip whitespace after colon
127
+ while (pos < clean_end && IS_WHITESPACE(*pos)) {
128
+ pos++;
129
+ }
130
+
131
+ // Find value (up to semicolon or end)
132
+ // Handle parentheses: semicolons inside () don't terminate the value
133
+ const char *val_start = pos;
134
+ int paren_depth = 0;
135
+ while (pos < clean_end) {
136
+ if (*pos == '(') {
137
+ paren_depth++;
138
+ } else if (*pos == ')') {
139
+ paren_depth--;
140
+ } else if (*pos == ';' && paren_depth == 0) {
141
+ break; // Found terminating semicolon
142
+ }
143
+ pos++;
144
+ }
145
+ const char *val_end = pos;
146
+
147
+ // Trim trailing whitespace from value
148
+ trim_trailing(val_start, &val_end);
149
+
150
+ // Check for !important
151
+ int is_important = 0;
152
+ // Look backwards for "!important"
153
+ if (val_end - val_start >= 10) { // strlen("!important") = 10
154
+ const char *check = val_end - 10;
155
+ while (check < val_end && IS_WHITESPACE(*check)) check++;
156
+ if (check < val_end && *check == '!') {
157
+ check++;
158
+ while (check < val_end && IS_WHITESPACE(*check)) check++;
159
+ if ((val_end - check) >= 9 && strncmp(check, "important", 9) == 0) {
160
+ is_important = 1;
161
+ const char *important_pos = check - 1;
162
+ while (important_pos > val_start && (IS_WHITESPACE(*(important_pos-1)) || *(important_pos-1) == '!')) {
163
+ important_pos--;
164
+ }
165
+ val_end = important_pos;
166
+ }
167
+ }
168
+ }
169
+
170
+ // Final trim of trailing whitespace/newlines from value (after !important removal)
171
+ trim_trailing(val_start, &val_end);
172
+
173
+ // Skip if value is empty (e.g., "color: !important" with no actual value)
174
+ if (val_end > val_start) {
175
+ // Sanity check: property name length
176
+ long prop_len = prop_end - prop_start;
177
+ if (prop_len > MAX_PROPERTY_NAME_LENGTH) {
178
+ DEBUG_PRINTF("[capture_declarations] Skipping property: name too long (%ld > %d)\n",
179
+ prop_len, MAX_PROPERTY_NAME_LENGTH);
180
+ continue;
181
+ }
182
+
183
+ // Sanity check: value length
184
+ long val_len = val_end - val_start;
185
+ if (val_len > MAX_PROPERTY_VALUE_LENGTH) {
186
+ DEBUG_PRINTF("[capture_declarations] Skipping property: value too long (%ld > %d)\n",
187
+ val_len, MAX_PROPERTY_VALUE_LENGTH);
188
+ continue;
189
+ }
190
+
191
+ // Create property string and lowercase it (CSS property names are ASCII-only)
192
+ VALUE property_raw = rb_usascii_str_new(prop_start, prop_len);
193
+ VALUE property = lowercase_property(property_raw);
194
+ VALUE value = rb_utf8_str_new(val_start, val_end - val_start);
195
+
196
+ DEBUG_PRINTF("[capture_declarations] Found: property='%s' value='%s' important=%d\n",
197
+ RSTRING_PTR(property), RSTRING_PTR(value), is_important);
198
+
199
+ // Create Declaration struct
200
+ VALUE decl = rb_struct_new(
201
+ cDeclaration,
202
+ property,
203
+ value,
204
+ is_important ? Qtrue : Qfalse
205
+ );
206
+
207
+ rb_ary_push(*current_declarations, decl);
208
+
209
+ // Protect temporaries from GC (in case compiler optimizes them to registers)
210
+ RB_GC_GUARD(property);
211
+ RB_GC_GUARD(value);
212
+ RB_GC_GUARD(decl);
213
+ } else {
214
+ DEBUG_PRINTF("[capture_declarations] Skipping empty value for property at pos %td\n",
215
+ (ptrdiff_t)(prop_start - css_string_base));
216
+ }
217
+
218
+ if (pos < clean_end && *pos == ';') pos++; // Skip semicolon if present
219
+ }
220
+
221
+ // Free temporary buffer if allocated
222
+ if (clean_buffer) {
223
+ xfree(clean_buffer);
224
+ }
225
+
226
+ // Reset for next rule
227
+ *decl_start_ptr = NULL;
228
+ }
229
+
230
+ // Create Rule structs from current selectors and declarations
231
+ void finish_rule_fn(
232
+ int inside_at_rule_block,
233
+ VALUE *current_selectors,
234
+ VALUE *current_declarations,
235
+ VALUE *current_media_types,
236
+ VALUE rules_by_media, // Hash: {query_string => {media_types: [...], rules: [...]}}
237
+ const char **mark_ptr
238
+ ) {
239
+ // Skip if we're scanning at-rule block content (will be parsed recursively)
240
+ if (inside_at_rule_block) {
241
+ DEBUG_PRINTF("[finish_rule] SKIPPED (inside media block)\n");
242
+ goto cleanup;
243
+ }
244
+
245
+ // Create one rule for each selector in the list
246
+ if (NIL_P(*current_selectors) || NIL_P(*current_declarations)) {
247
+ goto cleanup;
248
+ }
249
+
250
+ long len = RARRAY_LEN(*current_selectors);
251
+ DEBUG_PRINTF("[finish_rule] Creating %ld rule(s)\n", len);
252
+
253
+ for (long i = 0; i < len; i++) {
254
+ VALUE sel = RARRAY_AREF(*current_selectors, i);
255
+ DEBUG_PRINTF("[finish_rule] Rule %ld: selector='%s'\n", i, RSTRING_PTR(sel));
256
+
257
+ // Determine media query string for grouping
258
+ VALUE query_string = Qnil;
259
+ VALUE media_types_array = Qnil;
260
+
261
+ if (!NIL_P(*current_media_types)) {
262
+ query_string = rb_hash_aref(*current_media_types, ID2SYM(rb_intern("query_string")));
263
+ media_types_array = rb_hash_aref(*current_media_types, ID2SYM(rb_intern("media_types")));
264
+ DEBUG_PRINTF("[finish_rule] current_media_types present, query_string=%s\n",
265
+ NIL_P(query_string) ? "nil" : RSTRING_PTR(query_string));
266
+ } else {
267
+ DEBUG_PRINTF("[finish_rule] No media types (default/all)\n");
268
+ }
269
+ // query_string is nil for non-media rules (default/all)
270
+
271
+ // Create rule (media info stored at group level, not on rule)
272
+ VALUE rule = rb_struct_new(cRule,
273
+ sel, // selector
274
+ rb_ary_dup(*current_declarations), // declarations
275
+ Qnil // specificity (calculated on demand)
276
+ );
277
+
278
+ // Get or create the group structure for this media query
279
+ VALUE group = rb_hash_aref(rules_by_media, query_string);
280
+ if (NIL_P(group)) {
281
+ // Create new group: {media_types: [...], rules: [...]}
282
+ group = rb_hash_new();
283
+ // Default to [:all] for non-media rules (css_parser gem compatibility)
284
+ VALUE media_types_for_group = NIL_P(media_types_array) ?
285
+ rb_ary_new_from_args(1, ID2SYM(rb_intern("all"))) :
286
+ media_types_array;
287
+ rb_hash_aset(group, ID2SYM(rb_intern("media_types")), media_types_for_group);
288
+ rb_hash_aset(group, ID2SYM(rb_intern("rules")), rb_ary_new());
289
+ rb_hash_aset(rules_by_media, query_string, group);
290
+ }
291
+
292
+ // Add rule to the group's rules array
293
+ VALUE rules_array = rb_hash_aref(group, ID2SYM(rb_intern("rules")));
294
+ rb_ary_push(rules_array, rule);
295
+ }
296
+
297
+ cleanup:
298
+ *current_selectors = Qnil;
299
+ *current_declarations = Qnil;
300
+ // Reset mark for next rule (in case it wasn't reset by capture action)
301
+ *mark_ptr = NULL;
302
+ }
303
+
304
+ // Parse media query string and return hash with query string and media types
305
+ // Returns: {query_string: "...", media_types: [...]}
306
+ // Example: "screen and (min-width: 768px)" -> {query_string: "screen and (min-width: 768px)", media_types: [:screen]}
307
+ // Example: "screen, print" -> {query_string: "screen, print", media_types: [:screen, :print]}
308
+ //
309
+ // Algorithm: Scan for identifiers (alphanumeric + dash), skip keywords and parens
310
+ VALUE parse_media_query(const char *query_str, long query_len) {
311
+ VALUE mq_types = rb_ary_new();
312
+
313
+ const char *p = query_str;
314
+ const char *pe = query_str + query_len;
315
+ int in_parens = 0;
316
+
317
+ while (p < pe) {
318
+ // Skip whitespace
319
+ while (p < pe && IS_WHITESPACE(*p)) p++;
320
+ if (p >= pe) break;
321
+
322
+ // Track parentheses (skip content inside parens like "(min-width: 768px)")
323
+ if (*p == '(') {
324
+ in_parens++;
325
+ p++;
326
+ continue;
327
+ }
328
+ if (*p == ')') {
329
+ if (in_parens > 0) in_parens--;
330
+ p++;
331
+ continue;
332
+ }
333
+
334
+ // Skip non-identifier characters when not in parens
335
+ if (!in_parens && (*p == ',' || *p == ':' || *p == ';')) {
336
+ p++;
337
+ continue;
338
+ }
339
+
340
+ // Inside parens - skip everything
341
+ if (in_parens) {
342
+ p++;
343
+ continue;
344
+ }
345
+
346
+ // Check if this looks like an identifier start (letter or dash)
347
+ if ((*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') || *p == '-') {
348
+ const char *ident_start = p;
349
+
350
+ // Scan identifier (letters, digits, dashes)
351
+ while (p < pe && ((*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') ||
352
+ (*p >= '0' && *p <= '9') || *p == '-')) {
353
+ p++;
354
+ }
355
+
356
+ long ident_len = p - ident_start;
357
+
358
+ // Check if it's a keyword to skip
359
+ int is_keyword =
360
+ (ident_len == 3 && (strncmp(ident_start, "and", 3) == 0 || strncmp(ident_start, "not", 3) == 0)) ||
361
+ (ident_len == 2 && strncmp(ident_start, "or", 2) == 0) ||
362
+ (ident_len == 4 && strncmp(ident_start, "only", 4) == 0);
363
+
364
+ if (!is_keyword) {
365
+ // Capture as media type
366
+ ID media_id = rb_intern2(ident_start, ident_len);
367
+ VALUE media_sym = ID2SYM(media_id);
368
+ rb_ary_push(mq_types, media_sym);
369
+ DEBUG_PRINTF("[mq_parser] captured media type: %.*s\n", (int)ident_len, ident_start);
370
+ } else {
371
+ DEBUG_PRINTF("[mq_parser] skipped keyword: %.*s\n", (int)ident_len, ident_start);
372
+ }
373
+ } else {
374
+ // Not an identifier, skip character
375
+ p++;
376
+ }
377
+ }
378
+
379
+ // Return hash with both query string and media types array
380
+ VALUE result = rb_hash_new();
381
+ VALUE query_string = rb_utf8_str_new(query_str, query_len);
382
+ rb_hash_aset(result, ID2SYM(rb_intern("query_string")), query_string);
383
+ rb_hash_aset(result, ID2SYM(rb_intern("media_types")), mq_types);
384
+
385
+ return result;
386
+ }
387
+
388
+ // Helper: Copy string segment skipping comments
389
+ // Allocates new buffer and returns it with new length
390
+ static char* copy_without_comments(const char *start, const char *end, long *out_len) {
391
+ long max_len = end - start;
392
+ char *buffer = ALLOC_N(char, max_len);
393
+ char *write_pos = buffer;
394
+ const char *read_pos = start;
395
+
396
+ while (read_pos < end) {
397
+ // Check for comment start
398
+ if (read_pos + 1 < end && *read_pos == '/' && *(read_pos + 1) == '*') {
399
+ // Skip past comment
400
+ read_pos += 2;
401
+ while (read_pos + 1 < end) {
402
+ if (*read_pos == '*' && *(read_pos + 1) == '/') {
403
+ read_pos += 2;
404
+ break;
405
+ }
406
+ read_pos++;
407
+ }
408
+ } else {
409
+ // Copy character
410
+ *write_pos++ = *read_pos++;
411
+ }
412
+ }
413
+
414
+ *out_len = write_pos - buffer;
415
+ return buffer;
416
+ }
417
+
418
+ // Parse declarations string into array of Declaration structs
419
+ // Used by parse_declarations Ruby wrapper
420
+ VALUE parse_declarations_string(const char *start, const char *end) {
421
+ VALUE declarations = rb_ary_new();
422
+
423
+ // Fast path: check if there are any comments
424
+ int has_comments = 0;
425
+ for (const char *check = start; check + 1 < end; check++) {
426
+ if (*check == '/' && *(check + 1) == '*') {
427
+ has_comments = 1;
428
+ break;
429
+ }
430
+ }
431
+
432
+ // If there are comments, strip them first (rare case)
433
+ char *clean_buffer = NULL;
434
+ const char *clean_end = end;
435
+ if (has_comments) {
436
+ long clean_len;
437
+ clean_buffer = copy_without_comments(start, end, &clean_len);
438
+ start = clean_buffer;
439
+ clean_end = clean_buffer + clean_len;
440
+ }
441
+
442
+ const char *pos = start;
443
+ while (pos < clean_end) {
444
+ // Skip whitespace and semicolons
445
+ while (pos < clean_end && (IS_WHITESPACE(*pos) || *pos == ';')) pos++;
446
+ if (pos >= clean_end) break;
447
+
448
+ // Find property (up to colon)
449
+ const char *prop_start = pos;
450
+ while (pos < clean_end && *pos != ':') pos++;
451
+ if (pos >= clean_end) break; // No colon found
452
+
453
+ const char *prop_end = pos;
454
+ trim_trailing(prop_start, &prop_end);
455
+ trim_leading(&prop_start, prop_end);
456
+
457
+ pos++; // Skip colon
458
+ trim_leading(&pos, clean_end);
459
+
460
+ // Find value (up to semicolon or end), handling parentheses
461
+ const char *val_start = pos;
462
+ int paren_depth = 0;
463
+ while (pos < clean_end) {
464
+ if (*pos == '(') paren_depth++;
465
+ else if (*pos == ')') paren_depth--;
466
+ else if (*pos == ';' && paren_depth == 0) break;
467
+ pos++;
468
+ }
469
+ const char *val_end = pos;
470
+ trim_trailing(val_start, &val_end);
471
+
472
+ // Check for !important
473
+ int is_important = 0;
474
+ if (val_end - val_start >= 10) { // strlen("!important") = 10
475
+ const char *check = val_end - 10;
476
+ while (check < val_end && IS_WHITESPACE(*check)) check++;
477
+ if (check < val_end && *check == '!') {
478
+ check++;
479
+ while (check < val_end && IS_WHITESPACE(*check)) check++;
480
+ if ((val_end - check) >= 9 && strncmp(check, "important", 9) == 0) {
481
+ is_important = 1;
482
+ const char *important_pos = check - 1;
483
+ while (important_pos > val_start && (IS_WHITESPACE(*(important_pos-1)) || *(important_pos-1) == '!')) {
484
+ important_pos--;
485
+ }
486
+ val_end = important_pos;
487
+ trim_trailing(val_start, &val_end);
488
+ }
489
+ }
490
+ }
491
+
492
+ // Skip if value is empty
493
+ if (val_end > val_start) {
494
+ long prop_len = prop_end - prop_start;
495
+ if (prop_len > MAX_PROPERTY_NAME_LENGTH) continue;
496
+
497
+ long val_len = val_end - val_start;
498
+ if (val_len > MAX_PROPERTY_VALUE_LENGTH) continue;
499
+
500
+ // Create property string and lowercase it
501
+ VALUE property_raw = rb_usascii_str_new(prop_start, prop_len);
502
+ VALUE property = lowercase_property(property_raw);
503
+ VALUE value = rb_utf8_str_new(val_start, val_len);
504
+
505
+ // Create Declaration struct
506
+ VALUE decl = rb_struct_new(cDeclaration,
507
+ property, value, is_important ? Qtrue : Qfalse);
508
+
509
+ rb_ary_push(declarations, decl);
510
+ }
511
+ }
512
+
513
+ // Free temporary buffer if allocated
514
+ if (clean_buffer) {
515
+ xfree(clean_buffer);
516
+ }
517
+
518
+ return declarations;
519
+ }
520
+
521
+ // ============================================================================
522
+ // Main CSS Parser
523
+ // ============================================================================
524
+
525
+ /*
526
+ * CSS parser implementation
527
+ *
528
+ * Parses selectors, declarations, and @rules. Creates Rule structs.
529
+ *
530
+ * @param css_string [String] CSS to parse
531
+ * @param depth [Integer] Recursion depth (for error handling)
532
+ * @param parent_media_query [VALUE] Parent media query hash (for nested @media), or Qnil
533
+ * @return [Hash] {query_string => [Rule]} grouped by media query
534
+ */
535
+ VALUE parse_css_impl(VALUE css_string, int depth, VALUE parent_media_query) {
536
+ Check_Type(css_string, T_STRING);
537
+
538
+ const char *p = RSTRING_PTR(css_string);
539
+ const char *pe = p + RSTRING_LEN(css_string);
540
+ const char *css_string_base = p;
541
+
542
+ // State variables
543
+ ParserState state = STATE_INITIAL;
544
+ const char *mark = NULL;
545
+ const char *decl_start = NULL;
546
+ const char *selector_start = NULL;
547
+
548
+ // Ruby objects
549
+ VALUE rules_by_media = rb_hash_new(); // Hash: {query_string => {media_types: [...], rules: [...]}}
550
+ VALUE current_selectors = Qnil;
551
+ VALUE current_declarations = Qnil;
552
+ VALUE selector = Qnil;
553
+ VALUE current_media_types = parent_media_query; // Inherit parent's media context
554
+
555
+ while (p < pe) {
556
+ char c = *p;
557
+
558
+ // Skip whitespace in most states
559
+ if (IS_WHITESPACE(c) && state != STATE_DECLARATIONS && state != STATE_SELECTOR) {
560
+ p++;
561
+ continue;
562
+ }
563
+
564
+ // Skip comments everywhere
565
+ if (c == '/' && p + 1 < pe && *(p + 1) == '*') {
566
+ // Find end of comment
567
+ p += 2;
568
+ while (p + 1 < pe) {
569
+ if (*p == '*' && *(p + 1) == '/') {
570
+ p += 2;
571
+ break;
572
+ }
573
+ p++;
574
+ }
575
+ continue;
576
+ }
577
+
578
+ switch (state) {
579
+ case STATE_INITIAL:
580
+ if (c == '@') {
581
+ // @rule detected - parse it
582
+ const char *at_start = p + 1; // Skip @
583
+ const char *at_end = at_start;
584
+
585
+ // Find end of @rule name (until space or {)
586
+ while (at_end < pe && !IS_WHITESPACE(*at_end) && *at_end != '{' && *at_end != ';') {
587
+ at_end++;
588
+ }
589
+
590
+ long name_len = at_end - at_start;
591
+ char at_name[256];
592
+ if (name_len > 255) name_len = 255;
593
+ strncpy(at_name, at_start, name_len);
594
+ at_name[name_len] = '\0';
595
+
596
+ DEBUG_PRINTF("[pure_c] @rule detected: @%s at pos %td\n", at_name, (ptrdiff_t)(p - css_string_base));
597
+
598
+ // Skip to prelude start (after name, before {)
599
+ p = at_end;
600
+ while (p < pe && IS_WHITESPACE(*p)) p++;
601
+
602
+ const char *prelude_start = p;
603
+
604
+ // Check for statement-style @rule (ends with ;)
605
+ const char *check = p;
606
+ while (check < pe && *check != '{' && *check != ';') check++;
607
+
608
+ if (check >= pe) {
609
+ // Incomplete - skip
610
+ p = pe;
611
+ break;
612
+ }
613
+
614
+ if (*check == ';') {
615
+ // Statement-style @rule (@charset, @import, etc.) - skip it
616
+ p = check + 1;
617
+ DEBUG_PRINTF("[pure_c] Skipped statement @rule @%s\n", at_name);
618
+ break;
619
+ }
620
+
621
+ // Block-style @rule - find prelude end (the {)
622
+ while (p < pe && *p != '{') p++;
623
+
624
+ if (p >= pe) break; // Incomplete
625
+
626
+ const char *prelude_end = p;
627
+
628
+ // Trim whitespace from prelude
629
+ while (prelude_end > prelude_start && IS_WHITESPACE(*(prelude_end - 1))) {
630
+ prelude_end--;
631
+ }
632
+
633
+ long prelude_len = prelude_end - prelude_start;
634
+
635
+ p++; // Skip opening {
636
+
637
+ // Find matching closing brace
638
+ int brace_depth = 1;
639
+ const char *block_start = p;
640
+
641
+ while (p < pe && brace_depth > 0) {
642
+ if (*p == '{') {
643
+ brace_depth++;
644
+ } else if (*p == '}') {
645
+ brace_depth--;
646
+ } else if (*p == '/' && p + 1 < pe && *(p + 1) == '*') {
647
+ // Skip comments when counting braces
648
+ p += 2;
649
+ while (p + 1 < pe && !(*p == '*' && *(p + 1) == '/')) p++;
650
+ if (p + 1 < pe) p += 2;
651
+ continue;
652
+ }
653
+ p++;
654
+ }
655
+
656
+ const char *block_end = p - 1; // Before closing }
657
+ long block_len = block_end - block_start;
658
+
659
+ DEBUG_PRINTF("[pure_c] @%s block: %ld bytes\n", at_name, block_len);
660
+
661
+ // Process based on @rule type
662
+ if (strcmp(at_name, "media") == 0) {
663
+ // Parse media query for this block
664
+ VALUE media_query = parse_media_query(prelude_start, prelude_len);
665
+
666
+ // Combine with parent media query if nested (per W3C spec)
667
+ VALUE combined_media_query = media_query;
668
+ if (!NIL_P(parent_media_query)) {
669
+ VALUE parent_qs = rb_hash_aref(parent_media_query, ID2SYM(rb_intern("query_string")));
670
+ VALUE current_qs = rb_hash_aref(media_query, ID2SYM(rb_intern("query_string")));
671
+
672
+ // Combine: "screen" + " and " + "(min-width: 768px)" = "screen and (min-width: 768px)"
673
+ VALUE combined_qs = rb_str_new_cstr("");
674
+ if (!NIL_P(parent_qs)) rb_str_append(combined_qs, parent_qs);
675
+ if (!NIL_P(parent_qs) && !NIL_P(current_qs)) rb_str_cat2(combined_qs, " and ");
676
+ if (!NIL_P(current_qs)) rb_str_append(combined_qs, current_qs);
677
+
678
+ // Combine media_types arrays (union of parent and current)
679
+ VALUE parent_media_types = rb_hash_aref(parent_media_query, ID2SYM(rb_intern("media_types")));
680
+ VALUE current_media_types = rb_hash_aref(media_query, ID2SYM(rb_intern("media_types")));
681
+ VALUE combined_media_types = rb_ary_dup(parent_media_types);
682
+
683
+ // Add current media types if they're not already in the array
684
+ long current_len = RARRAY_LEN(current_media_types);
685
+ for (long i = 0; i < current_len; i++) {
686
+ VALUE media_type = RARRAY_AREF(current_media_types, i);
687
+ if (!rb_ary_includes(combined_media_types, media_type)) {
688
+ rb_ary_push(combined_media_types, media_type);
689
+ }
690
+ }
691
+
692
+ combined_media_query = rb_hash_new();
693
+ rb_hash_aset(combined_media_query, ID2SYM(rb_intern("query_string")), combined_qs);
694
+ rb_hash_aset(combined_media_query, ID2SYM(rb_intern("media_types")), combined_media_types);
695
+ }
696
+
697
+ // Recursively parse block content with combined media context
698
+ VALUE block_content = rb_str_new(block_start, block_len);
699
+ VALUE inner_hash = parse_css_impl(block_content, depth + 1, combined_media_query);
700
+
701
+ // Merge inner_hash into our rules_by_media using rb_hash_foreach
702
+ struct merge_hash_ctx merge_ctx = { rules_by_media };
703
+ rb_hash_foreach(inner_hash, merge_hash_callback, (VALUE)&merge_ctx);
704
+
705
+ RB_GC_GUARD(media_query);
706
+ RB_GC_GUARD(combined_media_query);
707
+ RB_GC_GUARD(block_content);
708
+ RB_GC_GUARD(inner_hash);
709
+
710
+ } else if (strcmp(at_name, "supports") == 0 || strcmp(at_name, "layer") == 0 ||
711
+ strcmp(at_name, "container") == 0 || strcmp(at_name, "scope") == 0) {
712
+ // Conditional group rules - recursively parse and merge
713
+ VALUE block_content = rb_str_new(block_start, block_len);
714
+ VALUE inner_hash = parse_css_impl(block_content, depth + 1, parent_media_query);
715
+
716
+ // Merge inner_hash into rules_by_media using rb_hash_foreach
717
+ struct merge_hash_ctx merge_ctx = { rules_by_media };
718
+ rb_hash_foreach(inner_hash, merge_hash_callback, (VALUE)&merge_ctx);
719
+
720
+ RB_GC_GUARD(block_content);
721
+ RB_GC_GUARD(inner_hash);
722
+
723
+ } else if (strstr(at_name, "keyframes") != NULL) {
724
+ // @keyframes - create dummy rule with animation name
725
+ // Strip whitespace without rb_funcall
726
+ VALUE animation_name = strip_string(prelude_start, prelude_len);
727
+
728
+ // Build selector: "@keyframes " + name
729
+ VALUE sel = UTF8_STR("@");
730
+ rb_str_cat(sel, at_name, strlen(at_name));
731
+ rb_str_cat2(sel, " ");
732
+ rb_str_append(sel, animation_name);
733
+
734
+ VALUE rule = rb_struct_new(cRule,
735
+ sel, // selector
736
+ rb_ary_new(), // declarations (empty)
737
+ Qnil // specificity
738
+ );
739
+
740
+ // Add to rules_by_media under current media context
741
+ VALUE query_string = NIL_P(parent_media_query) ? Qnil :
742
+ rb_hash_aref(parent_media_query, ID2SYM(rb_intern("query_string")));
743
+ VALUE media_types_array = NIL_P(parent_media_query) ?
744
+ rb_ary_new_from_args(1, ID2SYM(rb_intern("all"))) :
745
+ rb_hash_aref(parent_media_query, ID2SYM(rb_intern("media_types")));
746
+
747
+ // Get or create group
748
+ VALUE group = rb_hash_aref(rules_by_media, query_string);
749
+ if (NIL_P(group)) {
750
+ group = rb_hash_new();
751
+ rb_hash_aset(group, ID2SYM(rb_intern("media_types")), media_types_array);
752
+ rb_hash_aset(group, ID2SYM(rb_intern("rules")), rb_ary_new());
753
+ rb_hash_aset(rules_by_media, query_string, group);
754
+ }
755
+
756
+ VALUE rules_array = rb_hash_aref(group, ID2SYM(rb_intern("rules")));
757
+ rb_ary_push(rules_array, rule);
758
+
759
+ RB_GC_GUARD(animation_name);
760
+ RB_GC_GUARD(sel);
761
+ RB_GC_GUARD(rule);
762
+
763
+ } else if (strcmp(at_name, "font-face") == 0 || strcmp(at_name, "property") == 0 ||
764
+ strcmp(at_name, "page") == 0 || strcmp(at_name, "counter-style") == 0) {
765
+ // Descriptor-based @rules - parse block as declarations
766
+ // Wrap in dummy selector for parsing
767
+ VALUE wrapped = UTF8_STR("* { ");
768
+ rb_str_cat(wrapped, block_start, block_len);
769
+ rb_str_cat2(wrapped, " }");
770
+
771
+ VALUE dummy_hash = parse_css_impl(wrapped, depth + 1, parent_media_query);
772
+ VALUE declarations = Qnil;
773
+
774
+ // Extract first rule from the dummy parse (should be under nil key)
775
+ // dummy_hash structure: {query_string => {media_types: [...], rules: [...]}}
776
+ VALUE dummy_group = rb_hash_aref(dummy_hash, Qnil);
777
+ if (!NIL_P(dummy_group)) {
778
+ VALUE dummy_rules = rb_hash_aref(dummy_group, ID2SYM(rb_intern("rules")));
779
+ if (!NIL_P(dummy_rules) && RARRAY_LEN(dummy_rules) > 0) {
780
+ VALUE first_rule = RARRAY_AREF(dummy_rules, 0);
781
+ declarations = rb_struct_aref(first_rule, INT2FIX(RULE_DECLARATIONS));
782
+
783
+ // Build selector: "@" + name + [" " + prelude]
784
+ VALUE sel = UTF8_STR("@");
785
+ rb_str_cat(sel, at_name, strlen(at_name));
786
+
787
+ if (prelude_len > 0) {
788
+ // Strip whitespace without rb_funcall
789
+ VALUE prelude_val = strip_string(prelude_start, prelude_len);
790
+ if (RSTRING_LEN(prelude_val) > 0) {
791
+ rb_str_cat2(sel, " ");
792
+ rb_str_append(sel, prelude_val);
793
+ }
794
+ RB_GC_GUARD(prelude_val);
795
+ }
796
+
797
+ VALUE rule = rb_struct_new(cRule,
798
+ sel, // selector
799
+ declarations, // declarations
800
+ Qnil // specificity
801
+ );
802
+
803
+ // Add to rules_by_media under current media context
804
+ VALUE query_string = NIL_P(parent_media_query) ? Qnil :
805
+ rb_hash_aref(parent_media_query, ID2SYM(rb_intern("query_string")));
806
+ VALUE media_types_array = NIL_P(parent_media_query) ?
807
+ rb_ary_new_from_args(1, ID2SYM(rb_intern("all"))) :
808
+ rb_hash_aref(parent_media_query, ID2SYM(rb_intern("media_types")));
809
+
810
+ // Get or create group
811
+ VALUE group = rb_hash_aref(rules_by_media, query_string);
812
+ if (NIL_P(group)) {
813
+ group = rb_hash_new();
814
+ rb_hash_aset(group, ID2SYM(rb_intern("media_types")), media_types_array);
815
+ rb_hash_aset(group, ID2SYM(rb_intern("rules")), rb_ary_new());
816
+ rb_hash_aset(rules_by_media, query_string, group);
817
+ }
818
+
819
+ VALUE rules_array = rb_hash_aref(group, ID2SYM(rb_intern("rules")));
820
+ rb_ary_push(rules_array, rule);
821
+
822
+ RB_GC_GUARD(sel);
823
+ RB_GC_GUARD(rule);
824
+ }
825
+ }
826
+
827
+ RB_GC_GUARD(wrapped);
828
+ RB_GC_GUARD(dummy_hash);
829
+ RB_GC_GUARD(dummy_group);
830
+ RB_GC_GUARD(declarations);
831
+
832
+ } else {
833
+ // Unknown @rule - skip it
834
+ DEBUG_PRINTF("[pure_c] Skipping unknown @rule: @%s\n", at_name);
835
+ }
836
+
837
+ } else if (c == '}') {
838
+ // Stray closing brace - ignore
839
+ p++;
840
+ } else if (!IS_WHITESPACE(c)) {
841
+ // Start of selector
842
+ selector_start = p;
843
+ state = STATE_SELECTOR;
844
+ DEBUG_PRINTF("[pure_c] Starting selector at pos %td\n", (ptrdiff_t)(p - css_string_base));
845
+ }
846
+ break;
847
+
848
+ case STATE_SELECTOR:
849
+ if (c == '{') {
850
+ // End of selector, start of declarations
851
+ if (selector_start != NULL) {
852
+ const char *selector_end = p;
853
+
854
+ // Trim trailing whitespace
855
+ while (selector_end > selector_start && IS_WHITESPACE(*(selector_end - 1))) {
856
+ selector_end--;
857
+ }
858
+
859
+ // Split on comma and capture each selector
860
+ const char *seg_start = selector_start;
861
+ const char *seg = selector_start;
862
+
863
+ if (NIL_P(current_selectors)) {
864
+ current_selectors = rb_ary_new();
865
+ }
866
+
867
+ while (seg <= selector_end) {
868
+ if (seg == selector_end || *seg == ',') {
869
+ // Capture segment
870
+ const char *seg_end = seg;
871
+
872
+ // Trim whitespace from segment
873
+ while (seg_end > seg_start && IS_WHITESPACE(*(seg_end - 1))) {
874
+ seg_end--;
875
+ }
876
+ while (seg_start < seg_end && IS_WHITESPACE(*seg_start)) {
877
+ seg_start++;
878
+ }
879
+
880
+ if (seg_end > seg_start) {
881
+ VALUE sel = rb_utf8_str_new(seg_start, seg_end - seg_start);
882
+ rb_ary_push(current_selectors, sel);
883
+ DEBUG_PRINTF("[pure_c] Captured selector: '%s'\n", RSTRING_PTR(sel));
884
+ }
885
+
886
+ seg_start = seg + 1; // Skip comma
887
+ }
888
+ seg++;
889
+ }
890
+
891
+ selector_start = NULL;
892
+ }
893
+
894
+ p++; // Skip {
895
+ decl_start = p;
896
+ state = STATE_DECLARATIONS;
897
+ DEBUG_PRINTF("[pure_c] Starting declarations at pos %td\n", (ptrdiff_t)(p - css_string_base));
898
+ } else {
899
+ // Continue parsing selector
900
+ p++;
901
+ }
902
+ break;
903
+
904
+ case STATE_DECLARATIONS:
905
+ if (c == '}') {
906
+ // End of declaration block
907
+ // Capture declarations
908
+ capture_declarations_fn(&decl_start, p, &current_declarations, css_string_base);
909
+
910
+ // Create rule(s)
911
+ finish_rule_fn(0, &current_selectors, &current_declarations,
912
+ &current_media_types, rules_by_media, &mark);
913
+
914
+ p++; // Skip }
915
+ state = STATE_INITIAL;
916
+ DEBUG_PRINTF("[pure_c] Finished rule, back to initial at pos %td\n", (ptrdiff_t)(p - css_string_base));
917
+ } else {
918
+ // Continue parsing declarations
919
+ p++;
920
+ }
921
+ break;
922
+ }
923
+ }
924
+
925
+ // Cleanup: if we ended in the middle of parsing, try to finish
926
+ if (state == STATE_DECLARATIONS && decl_start != NULL) {
927
+ capture_declarations_fn(&decl_start, p, &current_declarations, css_string_base);
928
+ finish_rule_fn(0, &current_selectors, &current_declarations,
929
+ &current_media_types, rules_by_media, &mark);
930
+ }
931
+
932
+ return rules_by_media;
933
+ }