react-native-enriched 0.4.1 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/README.md +27 -2
  2. package/ReactNativeEnriched.podspec +5 -1
  3. package/android/generated/java/com/facebook/react/viewmanagers/EnrichedTextInputViewManagerDelegate.java +12 -0
  4. package/android/generated/java/com/facebook/react/viewmanagers/EnrichedTextInputViewManagerInterface.java +4 -0
  5. package/android/generated/jni/react/renderer/components/ReactNativeEnrichedSpec/EventEmitters.cpp +149 -0
  6. package/android/generated/jni/react/renderer/components/ReactNativeEnrichedSpec/EventEmitters.h +146 -0
  7. package/android/generated/jni/react/renderer/components/ReactNativeEnrichedSpec/Props.cpp +16 -1
  8. package/android/generated/jni/react/renderer/components/ReactNativeEnrichedSpec/Props.h +46 -0
  9. package/android/src/main/java/com/swmansion/enriched/common/GumboNormalizer.kt +5 -0
  10. package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedCheckboxListSpan.kt +3 -2
  11. package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedUnorderedListSpan.kt +2 -1
  12. package/android/src/main/java/com/swmansion/enriched/textinput/EnrichedTextInputView.kt +166 -20
  13. package/android/src/main/java/com/swmansion/enriched/textinput/EnrichedTextInputViewManager.kt +32 -0
  14. package/android/src/main/java/com/swmansion/enriched/textinput/MeasurementStore.kt +19 -2
  15. package/android/src/main/java/com/swmansion/enriched/textinput/events/OnContextMenuItemPressEvent.kt +35 -0
  16. package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedLineHeightSpan.kt +44 -0
  17. package/android/src/main/java/com/swmansion/enriched/textinput/styles/ParametrizedStyles.kt +16 -0
  18. package/android/src/main/java/com/swmansion/enriched/textinput/utils/EnrichedSpanState.kt +18 -12
  19. package/android/src/main/new_arch/CMakeLists.txt +9 -13
  20. package/android/src/main/new_arch/GumboNormalizerJni.cpp +14 -0
  21. package/android/src/main/new_arch/react/renderer/components/ReactNativeEnrichedSpec/conversions.h +2 -21
  22. package/cpp/CMakeLists.txt +50 -0
  23. package/cpp/GumboParser/GumboParser.h +34043 -0
  24. package/cpp/README.md +59 -0
  25. package/cpp/parser/GumboNormalizer.c +915 -0
  26. package/cpp/parser/GumboParser.cpp +16 -0
  27. package/cpp/parser/GumboParser.hpp +23 -0
  28. package/cpp/tests/GumboParserTest.cpp +457 -0
  29. package/ios/EnrichedTextInputView.h +2 -0
  30. package/ios/EnrichedTextInputView.mm +152 -2
  31. package/ios/config/InputConfig.h +3 -0
  32. package/ios/config/InputConfig.mm +15 -0
  33. package/ios/extensions/LayoutManagerExtension.mm +34 -11
  34. package/ios/generated/ReactNativeEnrichedSpec/EventEmitters.cpp +149 -0
  35. package/ios/generated/ReactNativeEnrichedSpec/EventEmitters.h +146 -0
  36. package/ios/generated/ReactNativeEnrichedSpec/Props.cpp +16 -1
  37. package/ios/generated/ReactNativeEnrichedSpec/Props.h +46 -0
  38. package/ios/generated/ReactNativeEnrichedSpec/RCTComponentViewHelpers.h +29 -0
  39. package/ios/inputParser/InputParser.mm +27 -0
  40. package/ios/inputTextView/InputTextView.mm +3 -3
  41. package/lib/module/EnrichedTextInput.js +43 -30
  42. package/lib/module/EnrichedTextInput.js.map +1 -1
  43. package/lib/module/spec/EnrichedTextInputNativeComponent.ts +118 -22
  44. package/lib/typescript/src/EnrichedTextInput.d.ts +24 -6
  45. package/lib/typescript/src/EnrichedTextInput.d.ts.map +1 -1
  46. package/lib/typescript/src/index.d.ts +1 -1
  47. package/lib/typescript/src/index.d.ts.map +1 -1
  48. package/lib/typescript/src/spec/EnrichedTextInputNativeComponent.d.ts +111 -21
  49. package/lib/typescript/src/spec/EnrichedTextInputNativeComponent.d.ts.map +1 -1
  50. package/package.json +5 -5
  51. package/src/EnrichedTextInput.tsx +79 -40
  52. package/src/index.tsx +0 -1
  53. package/src/spec/EnrichedTextInputNativeComponent.ts +118 -22
@@ -0,0 +1,915 @@
1
+ /**
2
+ * GumboNormalizer.c
3
+ *
4
+ * Gumbo-based HTML normalizer (C implementation).
5
+ * Converts arbitrary external HTML into the canonical subset that our enriched
6
+ * parser understands.
7
+ */
8
+
9
+ #define GUMBO_IMPLEMENTATION
10
+
11
+ #ifdef __clang__
12
+ #pragma clang diagnostic push
13
+ #pragma clang diagnostic ignored "-Weverything"
14
+ #elif defined(__GNUC__)
15
+ #pragma GCC diagnostic push
16
+ #pragma GCC diagnostic ignored "-Wall"
17
+ #pragma GCC diagnostic ignored "-Wextra"
18
+ #endif
19
+
20
+ #include "GumboParser.h"
21
+
22
+ #ifdef __clang__
23
+ #pragma clang diagnostic pop
24
+ #elif defined(__GNUC__)
25
+ #pragma GCC diagnostic pop
26
+ #endif
27
+
28
+ #include <ctype.h>
29
+ #include <stdlib.h>
30
+ #include <string.h>
31
+
32
+ /* ------------------------------------------------------------------ */
33
+ /* Dynamic string buffer */
34
+ /* ------------------------------------------------------------------ */
35
+
36
/**
 * Growable byte buffer that is kept NUL-terminated after every operation.
 *
 * Healthy state: data != NULL, cap > len and data[len] == '\0'.
 * Poisoned state: an allocation failed; data == NULL and len == cap == 0.
 * Every append/clear on a poisoned buffer is a no-op, so callers that only
 * look at data/len (or free(data)) never touch invalid memory.
 */
typedef struct {
  char *data;
  size_t len;
  size_t cap;
} buffer_t;

/** Releases the storage and puts the buffer into the poisoned state. */
static void buffer_poison(buffer_t *b) {
  free(b->data);
  b->data = NULL;
  b->len = 0;
  b->cap = 0;
}

/**
 * Creates an empty buffer with at least 64 bytes of capacity.
 * If the allocation fails the returned buffer is poisoned (data == NULL).
 */
static buffer_t buffer_create(size_t initial_cap) {
  buffer_t b;
  b.cap = initial_cap > 64 ? initial_cap : 64;
  b.len = 0;
  b.data = (char *)malloc(b.cap);
  if (b.data)
    b.data[0] = '\0';
  else
    b.cap = 0; /* poisoned from the start */
  return b;
}

/**
 * Makes room for `extra` more bytes plus the terminating NUL, doubling the
 * capacity as often as needed. On size overflow or allocation failure the
 * buffer is poisoned instead of being left with a dangling/NULL write target.
 */
static void buffer_ensure(buffer_t *b, size_t extra) {
  const size_t size_max = (size_t)-1;

  if (!b->data)
    return; /* already poisoned */

  /* len + extra + 1 must be representable. */
  if (extra >= size_max - b->len) {
    buffer_poison(b);
    return;
  }
  size_t need = b->len + extra + 1;
  if (need <= b->cap)
    return;

  size_t new_cap = b->cap;
  while (new_cap < need) {
    if (new_cap > size_max / 2) {
      new_cap = need; /* doubling would wrap; take exactly what is needed */
      break;
    }
    new_cap *= 2;
  }

  /* Keep the old pointer until realloc succeeds so it is never lost. */
  char *grown = (char *)realloc(b->data, new_cap);
  if (!grown) {
    buffer_poison(b);
    return;
  }
  b->data = grown;
  b->cap = new_cap;
}

/** Appends `n` bytes from `s`; does nothing for NULL/empty input. */
static void buffer_append(buffer_t *b, const char *s, size_t n) {
  if (!s || n == 0)
    return;
  buffer_ensure(b, n);
  if (!b->data)
    return; /* poisoned: drop the data rather than write through NULL */
  memcpy(b->data + b->len, s, n);
  b->len += n;
  b->data[b->len] = '\0';
}

/** Appends a NUL-terminated string; NULL is ignored. */
static void buffer_append_str(buffer_t *b, const char *s) {
  if (!s)
    return;
  buffer_append(b, s, strlen(s));
}

/** Empties the buffer while keeping its storage. */
static void buffer_clear(buffer_t *b) {
  b->len = 0;
  if (b->data)
    b->data[0] = '\0';
}

/**
 * Returns the underlying storage; the caller takes ownership and must free()
 * it. The result is NULL when the buffer is poisoned.
 */
static char *buffer_finish(buffer_t *b) { return b->data; /* caller owns */ }
81
+
82
+ /* ------------------------------------------------------------------ */
83
+ /* Tag classification helpers */
84
+ /* ------------------------------------------------------------------ */
85
+
86
/* Category assigned to a tag name by classify_tag(). */
typedef enum {
  TAG_CLASS_SKIP = 0,         /* tag stripped, children processed */
  TAG_CLASS_INLINE = 1,       /* canonical inline tag */
  TAG_CLASS_BLOCK = 2,        /* canonical block tag */
  TAG_CLASS_SELF_CLOSING = 3, /* e.g. <br>, <img> */
  TAG_CLASS_PASS = 4,         /* pass-through (e.g. <html>, <body>) */
} tag_class_t;
93
+
94
/**
 * Maps an alias tag name to the canonical tag used in the output
 * (strong -> b, em -> i, del/strike -> s, ins -> u, pre -> codeblock).
 * Returns NULL when `name` has no alias; the comparison is case-sensitive.
 */
static const char *canonical_name(const char *name) {
  static const struct {
    const char *alias;
    const char *canonical;
  } aliases[] = {
      {"strong", "b"}, {"em", "i"},  {"del", "s"},
      {"strike", "s"}, {"ins", "u"}, {"pre", "codeblock"},
  };

  for (size_t k = 0; k < sizeof aliases / sizeof aliases[0]; k++) {
    if (strcmp(name, aliases[k].alias) == 0)
      return aliases[k].canonical;
  }
  return NULL;
}
107
+
108
+ static tag_class_t classify_tag(const char *name) {
109
+ if (strcmp(name, "b") == 0 || strcmp(name, "i") == 0 ||
110
+ strcmp(name, "u") == 0 || strcmp(name, "s") == 0 ||
111
+ strcmp(name, "code") == 0 || strcmp(name, "a") == 0 ||
112
+ strcmp(name, "strong") == 0 || strcmp(name, "em") == 0 ||
113
+ strcmp(name, "del") == 0 || strcmp(name, "strike") == 0 ||
114
+ strcmp(name, "ins") == 0 || strcmp(name, "mention") == 0)
115
+ return TAG_CLASS_INLINE;
116
+
117
+ if (strcmp(name, "p") == 0 || strcmp(name, "h1") == 0 ||
118
+ strcmp(name, "h2") == 0 || strcmp(name, "h3") == 0 ||
119
+ strcmp(name, "h4") == 0 || strcmp(name, "h5") == 0 ||
120
+ strcmp(name, "h6") == 0 || strcmp(name, "ul") == 0 ||
121
+ strcmp(name, "ol") == 0 || strcmp(name, "li") == 0 ||
122
+ strcmp(name, "blockquote") == 0 || strcmp(name, "codeblock") == 0 ||
123
+ strcmp(name, "pre") == 0)
124
+ return TAG_CLASS_BLOCK;
125
+
126
+ if (strcmp(name, "br") == 0 || strcmp(name, "img") == 0)
127
+ return TAG_CLASS_SELF_CLOSING;
128
+
129
+ if (strcmp(name, "html") == 0 || strcmp(name, "head") == 0 ||
130
+ strcmp(name, "body") == 0)
131
+ return TAG_CLASS_PASS;
132
+
133
+ return TAG_CLASS_SKIP;
134
+ }
135
+
136
+ /* ------------------------------------------------------------------ */
137
+ /* DOM helpers — get tag name, node type checks */
138
+ /* ------------------------------------------------------------------ */
139
+
140
+ static bool is_element(GumboNode *node) {
141
+ return node && (node->type == GUMBO_NODE_ELEMENT ||
142
+ node->type == GUMBO_NODE_TEMPLATE);
143
+ }
144
+
145
+ static bool is_text(GumboNode *node) {
146
+ return node &&
147
+ (node->type == GUMBO_NODE_TEXT || node->type == GUMBO_NODE_WHITESPACE);
148
+ }
149
+
150
+ /** Get the lowercased tag name of an element node into buf. */
151
+ static const char *get_tag_name(GumboNode *node, char *buf, size_t buf_sz) {
152
+ if (!is_element(node))
153
+ return NULL;
154
+ GumboElement *el = &node->v.element;
155
+
156
+ if (el->tag != GUMBO_TAG_UNKNOWN) {
157
+ const char *name = gumbo_normalized_tagname(el->tag);
158
+ if (name && name[0]) {
159
+ size_t n = strlen(name);
160
+ if (n >= buf_sz)
161
+ n = buf_sz - 1;
162
+ memcpy(buf, name, n);
163
+ buf[n] = '\0';
164
+ return buf;
165
+ }
166
+ }
167
+
168
+ /* Unknown tag — extract from original_tag */
169
+ GumboStringPiece piece = el->original_tag;
170
+ gumbo_tag_from_original_text(&piece);
171
+ if (piece.data && piece.length > 0) {
172
+ size_t n = piece.length < buf_sz - 1 ? piece.length : buf_sz - 1;
173
+ for (size_t i = 0; i < n; i++)
174
+ buf[i] = (char)tolower((unsigned char)piece.data[i]);
175
+ buf[n] = '\0';
176
+ return buf;
177
+ }
178
+ return NULL;
179
+ }
180
+
181
+ static bool is_list_node(GumboNode *node) {
182
+ char buf[64];
183
+ const char *n = get_tag_name(node, buf, sizeof(buf));
184
+ return n && (strcmp(n, "ul") == 0 || strcmp(n, "ol") == 0);
185
+ }
186
+
187
+ static bool is_blockquote_node(GumboNode *node) {
188
+ char buf[64];
189
+ const char *n = get_tag_name(node, buf, sizeof(buf));
190
+ return n && strcmp(n, "blockquote") == 0;
191
+ }
192
+
193
+ static bool is_br_node(GumboNode *node) {
194
+ char buf[64];
195
+ const char *n = get_tag_name(node, buf, sizeof(buf));
196
+ return n && strcmp(n, "br") == 0;
197
+ }
198
+
199
+ static bool is_block_producing(GumboNode *node) {
200
+ char buf[64];
201
+ const char *n = get_tag_name(node, buf, sizeof(buf));
202
+ if (!n)
203
+ return false;
204
+ if (classify_tag(n) == TAG_CLASS_BLOCK)
205
+ return true;
206
+ return strcmp(n, "div") == 0 || strcmp(n, "table") == 0 ||
207
+ strcmp(n, "tr") == 0;
208
+ }
209
+
210
+ /** True if all children are inline/text (no block-producing elements). */
211
+ static bool is_purely_inline(GumboNode *node) {
212
+ if (!is_element(node))
213
+ return true;
214
+ GumboVector *children = &node->v.element.children;
215
+ for (unsigned int i = 0; i < children->length; i++) {
216
+ if (is_block_producing(children->data[i]))
217
+ return false;
218
+ }
219
+ return true;
220
+ }
221
+
222
+ /** True if any direct child is block-producing or a blockquote. */
223
+ static bool has_block_or_bq_child(GumboNode *node) {
224
+ if (!is_element(node))
225
+ return false;
226
+ GumboVector *children = &node->v.element.children;
227
+ for (unsigned int i = 0; i < children->length; i++) {
228
+ GumboNode *c = children->data[i];
229
+ if (is_block_producing(c) || is_blockquote_node(c))
230
+ return true;
231
+ }
232
+ return false;
233
+ }
234
+
235
+ /* ------------------------------------------------------------------ */
236
+ /* CSS style → canonical tag mapping (simple string matching) */
237
+ /* ------------------------------------------------------------------ */
238
+
239
/* Text styles recognised in an inline `style` attribute; each flag maps to
 * one canonical tag when emitted (<b>, <i>, <u>, <s>). */
typedef struct {
  bool bold;          /* emitted as <b> */
  bool italic;        /* emitted as <i> */
  bool underline;     /* emitted as <u> */
  bool strikethrough; /* emitted as <s> */
} css_styles_t;
245
+
246
/**
 * Finds the value of the first declaration named `prop_name` in an inline
 * CSS string.
 *
 * The scan goes declaration by declaration (separated by ';'). The property
 * name must start the declaration, is compared ASCII case-insensitively
 * (CSS property names are case-insensitive), and may be followed by spaces
 * or tabs before the ':'.
 *
 * @param style      start of the CSS text (need not be NUL-terminated)
 * @param style_len  number of bytes to examine
 * @param prop_name  NUL-terminated property name to look for
 * @param val_len    receives the value length, with spaces/tabs around the
 *                   value trimmed; only written when a match is found
 * @return pointer into `style` at the first byte of the value, or NULL
 */
static const char *find_css_value(const char *style, size_t style_len,
                                  const char *prop_name, size_t *val_len) {
  const size_t plen = strlen(prop_name);
  const char *end = style + style_len;
  const char *p = style;

  while (p < end) {
    /* Skip separators and whitespace before the next declaration. */
    while (p < end &&
           (*p == ' ' || *p == '\t' || *p == ';' || *p == '\n' || *p == '\r'))
      p++;
    if (p >= end)
      break;

    /* Strictly more than plen bytes must remain: name plus at least ':'. */
    if ((size_t)(end - p) > plen) {
      size_t k = 0;
      while (k < plen && tolower((unsigned char)p[k]) ==
                             tolower((unsigned char)prop_name[k]))
        k++;

      if (k == plen) {
        const char *after = p + plen;
        while (after < end && (*after == ' ' || *after == '\t'))
          after++;
        /* Requiring ':' here rejects longer names sharing this prefix,
         * e.g. "text-decoration-line" when looking for "text-decoration". */
        if (after < end && *after == ':') {
          after++;
          while (after < end && (*after == ' ' || *after == '\t'))
            after++;
          const char *val_start = after;
          while (after < end && *after != ';')
            after++;
          const char *val_end = after;
          while (val_end > val_start &&
                 (*(val_end - 1) == ' ' || *(val_end - 1) == '\t'))
            val_end--;
          *val_len = (size_t)(val_end - val_start);
          return val_start;
        }
      }
    }

    /* Not this declaration: advance past its terminating ';'. */
    while (p < end && *p != ';')
      p++;
    if (p < end)
      p++;
  }
  return NULL;
}
283
+
284
/**
 * Case-insensitive substring test: true if `needle` occurs anywhere in the
 * first `val_len` bytes of `val`. An empty needle always matches.
 */
static bool css_val_contains(const char *val, size_t val_len,
                             const char *needle) {
  const size_t needle_len = strlen(needle);
  if (needle_len > val_len)
    return false;

  const size_t last_start = val_len - needle_len;
  for (size_t start = 0; start <= last_start; start++) {
    size_t k = 0;
    while (k < needle_len && tolower((unsigned char)val[start + k]) ==
                                 tolower((unsigned char)needle[k]))
      k++;
    if (k == needle_len)
      return true;
  }
  return false;
}
303
+
304
+ static css_styles_t parse_css_style(const char *style_value, size_t style_len) {
305
+ css_styles_t result = {false, false, false, false};
306
+ if (!style_value || style_len == 0)
307
+ return result;
308
+
309
+ size_t vlen;
310
+ const char *val;
311
+
312
+ val = find_css_value(style_value, style_len, "font-weight", &vlen);
313
+ if (val) {
314
+ if (css_val_contains(val, vlen, "bold") ||
315
+ css_val_contains(val, vlen, "bolder")) {
316
+ result.bold = true;
317
+ } else {
318
+ int num = atoi(val);
319
+ if (num >= 700)
320
+ result.bold = true;
321
+ }
322
+ }
323
+
324
+ val = find_css_value(style_value, style_len, "font-style", &vlen);
325
+ if (val) {
326
+ if (css_val_contains(val, vlen, "italic") ||
327
+ css_val_contains(val, vlen, "oblique"))
328
+ result.italic = true;
329
+ }
330
+
331
+ {
332
+ const char *search_start = style_value;
333
+ size_t search_remaining = style_len;
334
+ while (search_remaining > 0) {
335
+ val = find_css_value(search_start, search_remaining,
336
+ "text-decoration-line", &vlen);
337
+ if (!val)
338
+ val = find_css_value(search_start, search_remaining, "text-decoration",
339
+ &vlen);
340
+ if (!val)
341
+ break;
342
+ if (css_val_contains(val, vlen, "underline"))
343
+ result.underline = true;
344
+ if (css_val_contains(val, vlen, "line-through"))
345
+ result.strikethrough = true;
346
+ size_t consumed = (size_t)(val + vlen - search_start);
347
+ if (consumed >= search_remaining)
348
+ break;
349
+ search_start = val + vlen;
350
+ search_remaining = style_len - (size_t)(search_start - style_value);
351
+ }
352
+ }
353
+
354
+ return result;
355
+ }
356
+
357
+ static css_styles_t extra_styles(css_styles_t s, const char *tag) {
358
+ if (strcmp(tag, "b") == 0)
359
+ s.bold = false;
360
+ if (strcmp(tag, "i") == 0)
361
+ s.italic = false;
362
+ if (strcmp(tag, "u") == 0)
363
+ s.underline = false;
364
+ if (strcmp(tag, "s") == 0)
365
+ s.strikethrough = false;
366
+ return s;
367
+ }
368
+
369
+ static void emit_styles_open(buffer_t *out, css_styles_t s) {
370
+ if (s.bold)
371
+ buffer_append_str(out, "<b>");
372
+ if (s.italic)
373
+ buffer_append_str(out, "<i>");
374
+ if (s.underline)
375
+ buffer_append_str(out, "<u>");
376
+ if (s.strikethrough)
377
+ buffer_append_str(out, "<s>");
378
+ }
379
+
380
+ static void emit_styles_close(buffer_t *out, css_styles_t s) {
381
+ if (s.strikethrough)
382
+ buffer_append_str(out, "</s>");
383
+ if (s.underline)
384
+ buffer_append_str(out, "</u>");
385
+ if (s.italic)
386
+ buffer_append_str(out, "</i>");
387
+ if (s.bold)
388
+ buffer_append_str(out, "</b>");
389
+ }
390
+
391
+ /* ------------------------------------------------------------------ */
392
+ /* Attribute emission helpers */
393
+ /* ------------------------------------------------------------------ */
394
+
395
+ static const char *get_attr(GumboElement *el, const char *name) {
396
+ GumboAttribute *attr = gumbo_get_attribute(&el->attributes, name);
397
+ if (attr)
398
+ return attr->value;
399
+ return NULL;
400
+ }
401
+
402
+ static void emit_one_attr(buffer_t *out, GumboElement *el,
403
+ const char *attr_name) {
404
+ const char *val = get_attr(el, attr_name);
405
+ if (val && val[0]) {
406
+ buffer_append_str(out, " ");
407
+ buffer_append_str(out, attr_name);
408
+ buffer_append_str(out, "=\"");
409
+ buffer_append_str(out, val);
410
+ buffer_append_str(out, "\"");
411
+ }
412
+ }
413
+
414
+ static void emit_attributes(GumboElement *el, const char *tag_name,
415
+ buffer_t *out) {
416
+ if (strcmp(tag_name, "a") == 0) {
417
+ emit_one_attr(out, el, "href");
418
+ } else if (strcmp(tag_name, "img") == 0) {
419
+ emit_one_attr(out, el, "src");
420
+ emit_one_attr(out, el, "alt");
421
+ emit_one_attr(out, el, "width");
422
+ emit_one_attr(out, el, "height");
423
+ } else if (strcmp(tag_name, "ul") == 0) {
424
+ const char *val = get_attr(el, "data-type");
425
+ if (val && strcmp(val, "checkbox") == 0)
426
+ buffer_append_str(out, " data-type=\"checkbox\"");
427
+ } else if (strcmp(tag_name, "li") == 0) {
428
+ if (gumbo_get_attribute(&el->attributes, "checked") != NULL)
429
+ buffer_append_str(out, " checked");
430
+ } else if (strcmp(tag_name, "mention") == 0) {
431
+ emit_one_attr(out, el, "id");
432
+ emit_one_attr(out, el, "text");
433
+ emit_one_attr(out, el, "indicator");
434
+ }
435
+ }
436
+
437
+ /* ------------------------------------------------------------------ */
438
+ /* Google Docs specific handling */
439
+ /* ------------------------------------------------------------------ */
440
+
441
+ static bool is_google_docs_wrapper(GumboElement *el, const char *tag_name) {
442
+ if (strcmp(tag_name, "b") != 0)
443
+ return false;
444
+ const char *id_val = get_attr(el, "id");
445
+ if (!id_val)
446
+ return false;
447
+ size_t id_len = strlen(id_val);
448
+ return (id_len > 20 && strncmp(id_val, "docs-internal-guid-", 19) == 0);
449
+ }
450
+
451
+ /* ------------------------------------------------------------------ */
452
+ /* Recursive DOM tree walker */
453
+ /* ------------------------------------------------------------------ */
454
+
455
+ static void walk_node(GumboNode *node, buffer_t *out);
456
+
457
+ /* ------------------------------------------------------------------ */
458
+ /* Blockquote content flattening */
459
+ /* ------------------------------------------------------------------ */
460
+
461
+ static void flatten_bq_node(GumboNode *node, buffer_t *ib, buffer_t *out);
462
+
463
+ static void flush_inline_p(buffer_t *ib, buffer_t *out) {
464
+ if (ib->len > 0) {
465
+ buffer_append_str(out, "<p>");
466
+ buffer_append(out, ib->data, ib->len);
467
+ buffer_append_str(out, "</p>");
468
+ buffer_clear(ib);
469
+ }
470
+ }
471
+
472
+ static void flatten_bq_children(GumboNode *node, buffer_t *ib, buffer_t *out) {
473
+ if (!is_element(node))
474
+ return;
475
+ GumboVector *children = &node->v.element.children;
476
+ for (unsigned int i = 0; i < children->length; i++) {
477
+ flatten_bq_node(children->data[i], ib, out);
478
+ }
479
+ }
480
+
481
+ static void flatten_bq_node(GumboNode *node, buffer_t *ib, buffer_t *out) {
482
+ if (!node)
483
+ return;
484
+ if (is_text(node)) {
485
+ walk_node(node, ib);
486
+ return;
487
+ }
488
+ if (!is_element(node)) {
489
+ return;
490
+ }
491
+ if (is_br_node(node)) {
492
+ flush_inline_p(ib, out);
493
+ return;
494
+ }
495
+ if (is_block_producing(node) || is_blockquote_node(node)) {
496
+ flush_inline_p(ib, out);
497
+ flatten_bq_children(node, ib, out);
498
+ flush_inline_p(ib, out);
499
+ return;
500
+ }
501
+ walk_node(node, ib);
502
+ }
503
+
504
+ /* ------------------------------------------------------------------ */
505
+ /* List item content flattening */
506
+ /* ------------------------------------------------------------------ */
507
+
508
+ typedef struct {
509
+ GumboElement *el;
510
+ css_styles_t styles;
511
+ GumboNode **nested_lists;
512
+ int *nested_count;
513
+ int max_nested;
514
+ } li_ctx_t;
515
+
516
+ static void flatten_li_node(GumboNode *node, buffer_t *ib, buffer_t *out,
517
+ li_ctx_t *ctx);
518
+
519
+ static void flush_li_buffer(buffer_t *ib, buffer_t *out, li_ctx_t *ctx) {
520
+ if (ib->len == 0)
521
+ return;
522
+ buffer_append_str(out, "<li");
523
+ emit_attributes(ctx->el, "li", out);
524
+ buffer_append_str(out, ">");
525
+ emit_styles_open(out, ctx->styles);
526
+ buffer_append(out, ib->data, ib->len);
527
+ emit_styles_close(out, ctx->styles);
528
+ buffer_append_str(out, "</li>");
529
+ buffer_clear(ib);
530
+ }
531
+
532
+ static void flatten_li_children(GumboNode *node, buffer_t *ib, buffer_t *out,
533
+ li_ctx_t *ctx) {
534
+ if (!is_element(node))
535
+ return;
536
+ GumboVector *children = &node->v.element.children;
537
+ for (unsigned int i = 0; i < children->length; i++) {
538
+ flatten_li_node(children->data[i], ib, out, ctx);
539
+ }
540
+ }
541
+
542
+ static void flatten_li_node(GumboNode *node, buffer_t *ib, buffer_t *out,
543
+ li_ctx_t *ctx) {
544
+ if (!node)
545
+ return;
546
+ if (is_text(node)) {
547
+ walk_node(node, ib);
548
+ return;
549
+ }
550
+ if (!is_element(node)) {
551
+ flatten_li_children(node, ib, out, ctx);
552
+ return;
553
+ }
554
+ if (is_list_node(node)) {
555
+ if (*ctx->nested_count < ctx->max_nested) {
556
+ ctx->nested_lists[*ctx->nested_count] = node;
557
+ (*ctx->nested_count)++;
558
+ }
559
+ return;
560
+ }
561
+ if (is_br_node(node)) {
562
+ flush_li_buffer(ib, out, ctx);
563
+ return;
564
+ }
565
+ if (is_block_producing(node) || is_blockquote_node(node)) {
566
+ flush_li_buffer(ib, out, ctx);
567
+ flatten_li_children(node, ib, out, ctx);
568
+ flush_li_buffer(ib, out, ctx);
569
+ return;
570
+ }
571
+ walk_node(node, ib);
572
+ }
573
+
574
+ /* ------------------------------------------------------------------ */
575
+ /* walk_children — the main child-iteration driver */
576
+ /* ------------------------------------------------------------------ */
577
+
578
+ static void walk_children(GumboNode *node, buffer_t *out) {
579
+ if (!is_element(node))
580
+ return;
581
+
582
+ GumboVector *children = &node->v.element.children;
583
+ bool parent_is_list = is_list_node(node);
584
+
585
+ /* Detect mixed content: does the parent have any block-producing child? */
586
+ bool has_block = false;
587
+ for (unsigned int j = 0; j < children->length; j++) {
588
+ if (is_block_producing(children->data[j])) {
589
+ has_block = true;
590
+ break;
591
+ }
592
+ }
593
+
594
+ unsigned int i = 0;
595
+ while (i < children->length) {
596
+ GumboNode *child = children->data[i];
597
+
598
+ /* Flatten list-inside-list */
599
+ if (parent_is_list && is_list_node(child)) {
600
+ walk_children(child, out);
601
+ i++;
602
+ continue;
603
+ }
604
+
605
+ /* Merge consecutive blockquotes, flattening content into <p>s */
606
+ if (is_blockquote_node(child)) {
607
+ buffer_append_str(out, "<blockquote>");
608
+ buffer_t bq_ib = buffer_create(64);
609
+ while (i < children->length && is_blockquote_node(children->data[i])) {
610
+ flatten_bq_children(children->data[i], &bq_ib, out);
611
+ i++;
612
+ }
613
+ flush_inline_p(&bq_ib, out);
614
+ free(bq_ib.data);
615
+ buffer_append_str(out, "</blockquote>");
616
+ continue;
617
+ }
618
+
619
+ /* Auto-paragraph: group inline runs into <p> when mixed with blocks */
620
+ if (has_block && !parent_is_list && !is_block_producing(child) &&
621
+ !is_blockquote_node(child)) {
622
+ buffer_t ib = buffer_create(64);
623
+ while (i < children->length && !is_block_producing(children->data[i]) &&
624
+ !is_blockquote_node(children->data[i])) {
625
+ child = children->data[i];
626
+ if (is_br_node(child)) {
627
+ if (ib.len > 0)
628
+ flush_inline_p(&ib, out);
629
+ else
630
+ buffer_append_str(out, "<br>");
631
+ i++;
632
+ continue;
633
+ }
634
+ /* Transparent inline wrapper for block/bq children */
635
+ if (is_element(child) && has_block_or_bq_child(child)) {
636
+ flush_inline_p(&ib, out);
637
+ walk_children(child, out);
638
+ i++;
639
+ continue;
640
+ }
641
+ walk_node(child, &ib);
642
+ i++;
643
+ }
644
+ flush_inline_p(&ib, out);
645
+ free(ib.data);
646
+ continue;
647
+ }
648
+
649
+ walk_node(child, out);
650
+ i++;
651
+ }
652
+ }
653
+
654
+ /* ------------------------------------------------------------------ */
655
+ /* walk_node — process a single DOM node */
656
+ /* ------------------------------------------------------------------ */
657
+
658
+ static void walk_node(GumboNode *node, buffer_t *out) {
659
+ if (!node)
660
+ return;
661
+
662
+ /* Text node */
663
+ if (is_text(node)) {
664
+ const char *text_raw = node->v.text.text;
665
+ if (text_raw) {
666
+ size_t text_len = strlen(text_raw);
667
+ for (size_t i = 0; i < text_len; i++) {
668
+ char c = text_raw[i];
669
+ switch (c) {
670
+ case '<':
671
+ buffer_append_str(out, "&lt;");
672
+ break;
673
+ case '>':
674
+ buffer_append_str(out, "&gt;");
675
+ break;
676
+ case '&':
677
+ buffer_append_str(out, "&amp;");
678
+ break;
679
+ default:
680
+ buffer_append(out, &c, 1);
681
+ break;
682
+ }
683
+ }
684
+ }
685
+ return;
686
+ }
687
+
688
+ if (!is_element(node)) {
689
+ walk_children(node, out);
690
+ return;
691
+ }
692
+
693
+ GumboElement *el = &node->v.element;
694
+ char name_buf[64];
695
+ if (!get_tag_name(node, name_buf, sizeof(name_buf))) {
696
+ walk_children(node, out);
697
+ return;
698
+ }
699
+
700
+ /* Strip <meta>, <style>, <script>, <title>, <link> */
701
+ if (strcmp(name_buf, "meta") == 0 || strcmp(name_buf, "style") == 0 ||
702
+ strcmp(name_buf, "script") == 0 || strcmp(name_buf, "title") == 0 ||
703
+ strcmp(name_buf, "link") == 0)
704
+ return;
705
+
706
+ /* Google Docs wrapper */
707
+ if (is_google_docs_wrapper(el, name_buf)) {
708
+ walk_children(node, out);
709
+ return;
710
+ }
711
+
712
+ const char *out_name = canonical_name(name_buf);
713
+ if (!out_name)
714
+ out_name = name_buf;
715
+
716
+ tag_class_t cls = classify_tag(name_buf);
717
+
718
+ /* --- <span>: CSS style → inline tags --- */
719
+ if (strcmp(name_buf, "span") == 0) {
720
+ const char *sval = get_attr(el, "style");
721
+ size_t slen = sval ? strlen(sval) : 0;
722
+ css_styles_t s = parse_css_style(sval, slen);
723
+ emit_styles_open(out, s);
724
+ walk_children(node, out);
725
+ emit_styles_close(out, s);
726
+ return;
727
+ }
728
+
729
+ /* --- <div>: becomes <p> or passes through --- */
730
+ if (strcmp(name_buf, "div") == 0) {
731
+ const char *sval = get_attr(el, "style");
732
+ size_t slen = sval ? strlen(sval) : 0;
733
+ css_styles_t s = parse_css_style(sval, slen);
734
+
735
+ if (is_purely_inline(node)) {
736
+ /* Split on <br> into separate <p>s */
737
+ buffer_t pb = buffer_create(64);
738
+ GumboVector *div_children = &el->children;
739
+ for (unsigned int di = 0; di < div_children->length; di++) {
740
+ GumboNode *dc = div_children->data[di];
741
+ if (is_br_node(dc)) {
742
+ if (pb.len > 0) {
743
+ buffer_append_str(out, "<p>");
744
+ emit_styles_open(out, s);
745
+ buffer_append(out, pb.data, pb.len);
746
+ emit_styles_close(out, s);
747
+ buffer_append_str(out, "</p>");
748
+ } else {
749
+ buffer_append_str(out, "<br>");
750
+ }
751
+ buffer_clear(&pb);
752
+ continue;
753
+ }
754
+ walk_node(dc, &pb);
755
+ }
756
+ if (pb.len > 0) {
757
+ buffer_append_str(out, "<p>");
758
+ emit_styles_open(out, s);
759
+ buffer_append(out, pb.data, pb.len);
760
+ emit_styles_close(out, s);
761
+ buffer_append_str(out, "</p>");
762
+ }
763
+ free(pb.data);
764
+ } else {
765
+ emit_styles_open(out, s);
766
+ walk_children(node, out);
767
+ emit_styles_close(out, s);
768
+ }
769
+ return;
770
+ }
771
+
772
+ /* --- Table elements --- */
773
+ if (strcmp(name_buf, "table") == 0 || strcmp(name_buf, "thead") == 0 ||
774
+ strcmp(name_buf, "tbody") == 0 || strcmp(name_buf, "tfoot") == 0 ||
775
+ strcmp(name_buf, "tr") == 0 || strcmp(name_buf, "td") == 0 ||
776
+ strcmp(name_buf, "th") == 0 || strcmp(name_buf, "caption") == 0 ||
777
+ strcmp(name_buf, "colgroup") == 0 || strcmp(name_buf, "col") == 0) {
778
+ if (strcmp(name_buf, "td") == 0 || strcmp(name_buf, "th") == 0) {
779
+ walk_children(node, out);
780
+ /* Check if there's a next sibling element */
781
+ GumboNode *parent = node->parent;
782
+ if (parent && is_element(parent)) {
783
+ GumboVector *siblings = &parent->v.element.children;
784
+ unsigned int my_idx = node->index_within_parent;
785
+ /* Find next element sibling */
786
+ bool has_next_el = false;
787
+ for (unsigned int si = my_idx + 1; si < siblings->length; si++) {
788
+ GumboNode *sib = siblings->data[si];
789
+ if (is_element(sib)) {
790
+ has_next_el = true;
791
+ break;
792
+ }
793
+ }
794
+ if (has_next_el)
795
+ buffer_append_str(out, " ");
796
+ }
797
+ } else if (strcmp(name_buf, "tr") == 0) {
798
+ buffer_t row = buffer_create(64);
799
+ walk_children(node, &row);
800
+ if (row.len > 0) {
801
+ buffer_append_str(out, "<p>");
802
+ buffer_append(out, row.data, row.len);
803
+ buffer_append_str(out, "</p>");
804
+ }
805
+ free(row.data);
806
+ } else {
807
+ walk_children(node, out);
808
+ }
809
+ return;
810
+ }
811
+
812
+ /* --- Remaining tags handled by class --- */
813
+ switch (cls) {
814
+ case TAG_CLASS_PASS:
815
+ case TAG_CLASS_SKIP:
816
+ walk_children(node, out);
817
+ break;
818
+
819
+ case TAG_CLASS_SELF_CLOSING:
820
+ buffer_append_str(out, "<");
821
+ buffer_append_str(out, out_name);
822
+ emit_attributes(el, out_name, out);
823
+ buffer_append_str(out, strcmp(out_name, "img") == 0 ? " />" : ">");
824
+ break;
825
+
826
+ case TAG_CLASS_INLINE:
827
+ case TAG_CLASS_BLOCK: {
828
+ const char *sval = get_attr(el, "style");
829
+ size_t slen = sval ? strlen(sval) : 0;
830
+ css_styles_t es = extra_styles(parse_css_style(sval, slen), out_name);
831
+
832
+ /* <li>: always flatten */
833
+ if (strcmp(out_name, "li") == 0) {
834
+ GumboNode *nested_lists[16];
835
+ int nested_count = 0;
836
+ buffer_t li_ib = buffer_create(64);
837
+ li_ctx_t ctx = {el, es, nested_lists, &nested_count, 16};
838
+ flatten_li_children(node, &li_ib, out, &ctx);
839
+ flush_li_buffer(&li_ib, out, &ctx);
840
+ free(li_ib.data);
841
+ for (int k = 0; k < nested_count; k++)
842
+ walk_children(nested_lists[k], out);
843
+ break;
844
+ }
845
+
846
+ /* <codeblock>: wrap inline content in <p> */
847
+ if (strcmp(out_name, "codeblock") == 0) {
848
+ bool wrap = is_purely_inline(node);
849
+ buffer_append_str(out, "<codeblock>");
850
+ if (wrap)
851
+ buffer_append_str(out, "<p>");
852
+ walk_children(node, out);
853
+ if (wrap)
854
+ buffer_append_str(out, "</p>");
855
+ buffer_append_str(out, "</codeblock>");
856
+ break;
857
+ }
858
+
859
+ /* Generic block/inline tag */
860
+ buffer_append_str(out, "<");
861
+ buffer_append_str(out, out_name);
862
+ emit_attributes(el, out_name, out);
863
+ buffer_append_str(out, ">");
864
+ emit_styles_open(out, es);
865
+ walk_children(node, out);
866
+ emit_styles_close(out, es);
867
+ buffer_append_str(out, "</");
868
+ buffer_append_str(out, out_name);
869
+ buffer_append_str(out, ">");
870
+ break;
871
+ }
872
+ }
873
+ }
874
+
875
+ /* ------------------------------------------------------------------ */
876
+ /* Find <body> element from parse output */
877
+ /* ------------------------------------------------------------------ */
878
+
879
+ static GumboNode *find_body(GumboNode *root) {
880
+ if (!root || !is_element(root))
881
+ return root;
882
+ GumboVector *children = &root->v.element.children;
883
+ for (unsigned int i = 0; i < children->length; i++) {
884
+ GumboNode *child = children->data[i];
885
+ if (is_element(child) && child->v.element.tag == GUMBO_TAG_BODY)
886
+ return child;
887
+ }
888
+ return root;
889
+ }
890
+
891
+ /* ------------------------------------------------------------------ */
892
+ /* Public API */
893
+ /* ------------------------------------------------------------------ */
894
+
895
+ char *normalize_html(const char *html, size_t len) {
896
+ if (!html || len == 0)
897
+ return NULL;
898
+
899
+ GumboOutput *output =
900
+ gumbo_parse_with_options(&kGumboDefaultOptions, html, len);
901
+ if (!output)
902
+ return NULL;
903
+
904
+ GumboNode *body = find_body(output->root);
905
+ if (!body)
906
+ body = output->root;
907
+
908
+ buffer_t buf = buffer_create(len * 2);
909
+ walk_children(body, &buf);
910
+
911
+ gumbo_destroy_output(&kGumboDefaultOptions, output);
912
+ return buffer_finish(&buf);
913
+ }
914
+
915
/** Releases a string returned by normalize_html(); NULL is accepted. */
void free_normalized_html(char *result) {
  free(result);
}