@fendent/react-native-enriched 0.5.2-fork.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +20 -0
- package/README.md +343 -0
- package/ReactNativeEnriched.podspec +31 -0
- package/android/build.gradle +106 -0
- package/android/generated/java/com/facebook/react/viewmanagers/EnrichedTextInputViewManagerDelegate.java +197 -0
- package/android/generated/java/com/facebook/react/viewmanagers/EnrichedTextInputViewManagerInterface.java +72 -0
- package/android/generated/jni/react/renderer/components/ReactNativeEnrichedSpec/ComponentDescriptors.cpp +22 -0
- package/android/generated/jni/react/renderer/components/ReactNativeEnrichedSpec/ComponentDescriptors.h +24 -0
- package/android/generated/jni/react/renderer/components/ReactNativeEnrichedSpec/EventEmitters.cpp +434 -0
- package/android/generated/jni/react/renderer/components/ReactNativeEnrichedSpec/EventEmitters.h +391 -0
- package/android/generated/jni/react/renderer/components/ReactNativeEnrichedSpec/Props.cpp +173 -0
- package/android/generated/jni/react/renderer/components/ReactNativeEnrichedSpec/Props.h +833 -0
- package/android/generated/jni/react/renderer/components/ReactNativeEnrichedSpec/ShadowNodes.cpp +17 -0
- package/android/generated/jni/react/renderer/components/ReactNativeEnrichedSpec/ShadowNodes.h +23 -0
- package/android/generated/jni/react/renderer/components/ReactNativeEnrichedSpec/States.cpp +16 -0
- package/android/generated/jni/react/renderer/components/ReactNativeEnrichedSpec/States.h +20 -0
- package/android/gradle.properties +5 -0
- package/android/lint.gradle +70 -0
- package/android/src/main/AndroidManifest.xml +3 -0
- package/android/src/main/AndroidManifestNew.xml +2 -0
- package/android/src/main/java/com/swmansion/enriched/ReactNativeEnrichedPackage.kt +20 -0
- package/android/src/main/java/com/swmansion/enriched/common/AsyncDrawable.kt +126 -0
- package/android/src/main/java/com/swmansion/enriched/common/CheckboxDrawable.kt +81 -0
- package/android/src/main/java/com/swmansion/enriched/common/EnrichedConstants.kt +11 -0
- package/android/src/main/java/com/swmansion/enriched/common/EnrichedStyle.kt +57 -0
- package/android/src/main/java/com/swmansion/enriched/common/ForceRedrawSpan.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/common/GumboNormalizer.kt +5 -0
- package/android/src/main/java/com/swmansion/enriched/common/MentionStyle.kt +7 -0
- package/android/src/main/java/com/swmansion/enriched/common/ResourceManager.kt +26 -0
- package/android/src/main/java/com/swmansion/enriched/common/parser/EnrichedParser.java +956 -0
- package/android/src/main/java/com/swmansion/enriched/common/parser/EnrichedSpanFactory.kt +79 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedBlockQuoteSpan.kt +53 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedBoldSpan.kt +12 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedCheckboxListSpan.kt +92 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedCodeBlockSpan.kt +81 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedH1Span.kt +20 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedH2Span.kt +20 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedH3Span.kt +20 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedH4Span.kt +21 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedH5Span.kt +20 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedH6Span.kt +20 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedImageSpan.kt +184 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedInlineCodeSpan.kt +24 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedItalicSpan.kt +12 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedLinkSpan.kt +29 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedMentionSpan.kt +35 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedOrderedListSpan.kt +79 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedStrikeThroughSpan.kt +11 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedUnderlineSpan.kt +11 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/EnrichedUnorderedListSpan.kt +62 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/interfaces/EnrichedBlockSpan.kt +5 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/interfaces/EnrichedHeadingSpan.kt +3 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/interfaces/EnrichedInlineSpan.kt +3 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/interfaces/EnrichedParagraphSpan.kt +5 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/interfaces/EnrichedSpan.kt +3 -0
- package/android/src/main/java/com/swmansion/enriched/common/spans/interfaces/EnrichedZeroWidthSpaceSpan.kt +4 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/EnrichedTextInputConnectionWrapper.kt +140 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/EnrichedTextInputSpannableFactory.kt +83 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/EnrichedTextInputView.kt +1120 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/EnrichedTextInputViewLayoutManager.kt +27 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/EnrichedTextInputViewManager.kt +478 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/MeasurementStore.kt +225 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/MentionHandler.kt +55 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/OnChangeHtmlEvent.kt +27 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/OnChangeSelectionEvent.kt +30 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/OnChangeStateEvent.kt +21 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/OnChangeTextEvent.kt +30 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/OnContextMenuItemPressEvent.kt +35 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/OnInputBlurEvent.kt +25 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/OnInputFocusEvent.kt +25 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/OnInputKeyPressEvent.kt +27 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/OnLinkDetectedEvent.kt +32 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/OnMentionDetectedEvent.kt +30 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/OnMentionEvent.kt +34 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/OnPasteImagesEvent.kt +47 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/OnRequestHtmlResultEvent.kt +32 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/events/OnSubmitEditingEvent.kt +29 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputBlockQuoteSpan.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputBoldSpan.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputCheckboxListSpan.kt +15 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputCodeBlockSpan.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputH1Span.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputH2Span.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputH3Span.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputH4Span.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputH5Span.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputH6Span.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputImageSpan.kt +36 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputInlineCodeSpan.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputItalicSpan.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputLinkSpan.kt +16 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputMentionSpan.kt +18 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputOrderedListSpan.kt +21 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputStrikeThroughSpan.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputUnderlineSpan.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedInputUnorderedListSpan.kt +14 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedLineHeightSpan.kt +44 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/EnrichedSpans.kt +241 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/spans/interfaces/EnrichedInputSpan.kt +10 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/styles/HtmlStyle.kt +372 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/styles/InlineStyles.kt +164 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/styles/ListStyles.kt +263 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/styles/ParagraphStyles.kt +434 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/styles/ParametrizedStyles.kt +394 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/utils/EnrichedEditableFactory.kt +17 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/utils/EnrichedSelection.kt +320 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/utils/EnrichedSpanState.kt +310 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/utils/EnrichedSpannable.kt +106 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/utils/EnrichedSpannableStringBuilder.kt +24 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/utils/RichContentReceiver.kt +127 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/utils/Utils.kt +106 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/watchers/EnrichedSpanWatcher.kt +107 -0
- package/android/src/main/java/com/swmansion/enriched/textinput/watchers/EnrichedTextWatcher.kt +74 -0
- package/android/src/main/new_arch/CMakeLists.txt +62 -0
- package/android/src/main/new_arch/GumboNormalizerJni.cpp +14 -0
- package/android/src/main/new_arch/ReactNativeEnrichedSpec.cpp +11 -0
- package/android/src/main/new_arch/ReactNativeEnrichedSpec.h +15 -0
- package/android/src/main/new_arch/react/renderer/components/ReactNativeEnrichedSpec/EnrichedTextInputComponentDescriptor.h +35 -0
- package/android/src/main/new_arch/react/renderer/components/ReactNativeEnrichedSpec/EnrichedTextInputMeasurementManager.cpp +53 -0
- package/android/src/main/new_arch/react/renderer/components/ReactNativeEnrichedSpec/EnrichedTextInputMeasurementManager.h +25 -0
- package/android/src/main/new_arch/react/renderer/components/ReactNativeEnrichedSpec/EnrichedTextInputShadowNode.cpp +35 -0
- package/android/src/main/new_arch/react/renderer/components/ReactNativeEnrichedSpec/EnrichedTextInputShadowNode.h +53 -0
- package/android/src/main/new_arch/react/renderer/components/ReactNativeEnrichedSpec/EnrichedTextInputState.cpp +9 -0
- package/android/src/main/new_arch/react/renderer/components/ReactNativeEnrichedSpec/EnrichedTextInputState.h +24 -0
- package/android/src/main/new_arch/react/renderer/components/ReactNativeEnrichedSpec/conversions.h +27 -0
- package/android/src/main/res/drawable/broken_image.xml +10 -0
- package/cpp/CMakeLists.txt +50 -0
- package/cpp/GumboParser/GumboParser.h +34043 -0
- package/cpp/README.md +59 -0
- package/cpp/parser/GumboNormalizer.c +915 -0
- package/cpp/parser/GumboParser.cpp +16 -0
- package/cpp/parser/GumboParser.hpp +23 -0
- package/cpp/tests/GumboParserTest.cpp +457 -0
- package/ios/EnrichedTextInputView.h +53 -0
- package/ios/EnrichedTextInputView.mm +2360 -0
- package/ios/EnrichedTextInputViewManager.mm +13 -0
- package/ios/attributesManager/AttributesManager.h +17 -0
- package/ios/attributesManager/AttributesManager.mm +195 -0
- package/ios/config/InputConfig.h +104 -0
- package/ios/config/InputConfig.mm +664 -0
- package/ios/extensions/ColorExtension.h +7 -0
- package/ios/extensions/ColorExtension.mm +38 -0
- package/ios/extensions/FontExtension.h +11 -0
- package/ios/extensions/FontExtension.mm +72 -0
- package/ios/extensions/ImageExtension.h +34 -0
- package/ios/extensions/ImageExtension.mm +165 -0
- package/ios/extensions/LayoutManagerExtension.h +6 -0
- package/ios/extensions/LayoutManagerExtension.mm +443 -0
- package/ios/extensions/StringExtension.h +15 -0
- package/ios/extensions/StringExtension.mm +69 -0
- package/ios/generated/ReactNativeEnrichedSpec/ComponentDescriptors.cpp +22 -0
- package/ios/generated/ReactNativeEnrichedSpec/ComponentDescriptors.h +24 -0
- package/ios/generated/ReactNativeEnrichedSpec/EventEmitters.cpp +434 -0
- package/ios/generated/ReactNativeEnrichedSpec/EventEmitters.h +391 -0
- package/ios/generated/ReactNativeEnrichedSpec/Props.cpp +173 -0
- package/ios/generated/ReactNativeEnrichedSpec/Props.h +833 -0
- package/ios/generated/ReactNativeEnrichedSpec/RCTComponentViewHelpers.h +582 -0
- package/ios/generated/ReactNativeEnrichedSpec/ShadowNodes.cpp +17 -0
- package/ios/generated/ReactNativeEnrichedSpec/ShadowNodes.h +23 -0
- package/ios/generated/ReactNativeEnrichedSpec/States.cpp +16 -0
- package/ios/generated/ReactNativeEnrichedSpec/States.h +20 -0
- package/ios/inputParser/InputParser.h +11 -0
- package/ios/inputParser/InputParser.mm +1463 -0
- package/ios/inputTextView/InputTextView.h +6 -0
- package/ios/inputTextView/InputTextView.mm +285 -0
- package/ios/interfaces/AttributeEntry.h +9 -0
- package/ios/interfaces/AttributeEntry.mm +4 -0
- package/ios/interfaces/BaseStyleProtocol.h +17 -0
- package/ios/interfaces/ImageAttachment.h +11 -0
- package/ios/interfaces/ImageAttachment.mm +107 -0
- package/ios/interfaces/ImageData.h +10 -0
- package/ios/interfaces/ImageData.mm +4 -0
- package/ios/interfaces/LinkData.h +11 -0
- package/ios/interfaces/LinkData.mm +29 -0
- package/ios/interfaces/LinkRegexConfig.h +19 -0
- package/ios/interfaces/LinkRegexConfig.mm +37 -0
- package/ios/interfaces/MediaAttachment.h +23 -0
- package/ios/interfaces/MediaAttachment.mm +31 -0
- package/ios/interfaces/MentionParams.h +8 -0
- package/ios/interfaces/MentionParams.mm +4 -0
- package/ios/interfaces/MentionStyleProps.h +13 -0
- package/ios/interfaces/MentionStyleProps.mm +63 -0
- package/ios/interfaces/StyleBase.h +36 -0
- package/ios/interfaces/StyleBase.mm +256 -0
- package/ios/interfaces/StyleHeaders.h +102 -0
- package/ios/interfaces/StylePair.h +9 -0
- package/ios/interfaces/StylePair.mm +4 -0
- package/ios/interfaces/StyleTypeEnum.h +26 -0
- package/ios/interfaces/TextDecorationLineEnum.h +6 -0
- package/ios/interfaces/TextDecorationLineEnum.mm +4 -0
- package/ios/internals/EnrichedTextInputViewComponentDescriptor.h +19 -0
- package/ios/internals/EnrichedTextInputViewShadowNode.h +44 -0
- package/ios/internals/EnrichedTextInputViewShadowNode.mm +103 -0
- package/ios/internals/EnrichedTextInputViewState.cpp +10 -0
- package/ios/internals/EnrichedTextInputViewState.h +22 -0
- package/ios/styles/BlockQuoteStyle.mm +55 -0
- package/ios/styles/BoldStyle.mm +37 -0
- package/ios/styles/CheckboxListStyle.mm +153 -0
- package/ios/styles/CodeBlockStyle.mm +49 -0
- package/ios/styles/H1Style.mm +20 -0
- package/ios/styles/H2Style.mm +20 -0
- package/ios/styles/H3Style.mm +20 -0
- package/ios/styles/H4Style.mm +20 -0
- package/ios/styles/H5Style.mm +20 -0
- package/ios/styles/H6Style.mm +20 -0
- package/ios/styles/HeadingStyleBase.mm +65 -0
- package/ios/styles/ImageStyle.mm +146 -0
- package/ios/styles/InlineCodeStyle.mm +65 -0
- package/ios/styles/ItalicStyle.mm +37 -0
- package/ios/styles/LinkStyle.mm +532 -0
- package/ios/styles/MentionStyle.mm +538 -0
- package/ios/styles/OrderedListStyle.mm +86 -0
- package/ios/styles/StrikethroughStyle.mm +25 -0
- package/ios/styles/UnderlineStyle.mm +24 -0
- package/ios/styles/UnorderedListStyle.mm +86 -0
- package/ios/utils/CheckboxHitTestUtils.h +10 -0
- package/ios/utils/CheckboxHitTestUtils.mm +122 -0
- package/ios/utils/DotReplacementUtils.h +10 -0
- package/ios/utils/DotReplacementUtils.mm +68 -0
- package/ios/utils/KeyboardUtils.h +7 -0
- package/ios/utils/KeyboardUtils.mm +31 -0
- package/ios/utils/OccurenceUtils.h +44 -0
- package/ios/utils/OccurenceUtils.mm +179 -0
- package/ios/utils/ParagraphAttributesUtils.h +15 -0
- package/ios/utils/ParagraphAttributesUtils.mm +257 -0
- package/ios/utils/RangeUtils.h +12 -0
- package/ios/utils/RangeUtils.mm +183 -0
- package/ios/utils/TextBlockTapGestureRecognizer.h +17 -0
- package/ios/utils/TextBlockTapGestureRecognizer.mm +56 -0
- package/ios/utils/TextInsertionUtils.h +17 -0
- package/ios/utils/TextInsertionUtils.mm +64 -0
- package/ios/utils/WordsUtils.h +7 -0
- package/ios/utils/WordsUtils.mm +98 -0
- package/ios/utils/ZeroWidthSpaceUtils.h +9 -0
- package/ios/utils/ZeroWidthSpaceUtils.mm +270 -0
- package/lib/module/index.js +4 -0
- package/lib/module/index.js.map +1 -0
- package/lib/module/native/EnrichedTextInput.js +304 -0
- package/lib/module/native/EnrichedTextInput.js.map +1 -0
- package/lib/module/package.json +1 -0
- package/lib/module/spec/EnrichedTextInputNativeComponent.ts +517 -0
- package/lib/module/types.js +4 -0
- package/lib/module/types.js.map +1 -0
- package/lib/module/utils/EnrichedTextInputDefaultProps.js +12 -0
- package/lib/module/utils/EnrichedTextInputDefaultProps.js.map +1 -0
- package/lib/module/utils/normalizeHtmlStyle.js +155 -0
- package/lib/module/utils/normalizeHtmlStyle.js.map +1 -0
- package/lib/module/utils/nullthrows.js +9 -0
- package/lib/module/utils/nullthrows.js.map +1 -0
- package/lib/module/utils/regexParser.js +46 -0
- package/lib/module/utils/regexParser.js.map +1 -0
- package/lib/typescript/package.json +1 -0
- package/lib/typescript/src/index.d.ts +3 -0
- package/lib/typescript/src/index.d.ts.map +1 -0
- package/lib/typescript/src/native/EnrichedTextInput.d.ts +3 -0
- package/lib/typescript/src/native/EnrichedTextInput.d.ts.map +1 -0
- package/lib/typescript/src/spec/EnrichedTextInputNativeComponent.d.ts +397 -0
- package/lib/typescript/src/spec/EnrichedTextInputNativeComponent.d.ts.map +1 -0
- package/lib/typescript/src/types.d.ts +447 -0
- package/lib/typescript/src/types.d.ts.map +1 -0
- package/lib/typescript/src/utils/EnrichedTextInputDefaultProps.d.ts +10 -0
- package/lib/typescript/src/utils/EnrichedTextInputDefaultProps.d.ts.map +1 -0
- package/lib/typescript/src/utils/normalizeHtmlStyle.d.ts +4 -0
- package/lib/typescript/src/utils/normalizeHtmlStyle.d.ts.map +1 -0
- package/lib/typescript/src/utils/nullthrows.d.ts +2 -0
- package/lib/typescript/src/utils/nullthrows.d.ts.map +1 -0
- package/lib/typescript/src/utils/regexParser.d.ts +3 -0
- package/lib/typescript/src/utils/regexParser.d.ts.map +1 -0
- package/package.json +226 -0
- package/react-native.config.js +13 -0
- package/src/index.tsx +20 -0
- package/src/native/EnrichedTextInput.tsx +370 -0
- package/src/spec/EnrichedTextInputNativeComponent.ts +517 -0
- package/src/types.ts +499 -0
- package/src/utils/EnrichedTextInputDefaultProps.ts +9 -0
- package/src/utils/normalizeHtmlStyle.ts +199 -0
- package/src/utils/nullthrows.ts +7 -0
- package/src/utils/regexParser.ts +56 -0
|
@@ -0,0 +1,915 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GumboNormalizer.c
|
|
3
|
+
*
|
|
4
|
+
* Gumbo-based HTML normalizer (C implementation).
|
|
5
|
+
* Converts arbitrary external HTML into the canonical subset that our enriched
|
|
6
|
+
* parser understands.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
#define GUMBO_IMPLEMENTATION
|
|
10
|
+
|
|
11
|
+
#ifdef __clang__
|
|
12
|
+
#pragma clang diagnostic push
|
|
13
|
+
#pragma clang diagnostic ignored "-Weverything"
|
|
14
|
+
#elif defined(__GNUC__)
|
|
15
|
+
#pragma GCC diagnostic push
|
|
16
|
+
#pragma GCC diagnostic ignored "-Wall"
|
|
17
|
+
#pragma GCC diagnostic ignored "-Wextra"
|
|
18
|
+
#endif
|
|
19
|
+
|
|
20
|
+
#include "GumboParser.h"
|
|
21
|
+
|
|
22
|
+
#ifdef __clang__
|
|
23
|
+
#pragma clang diagnostic pop
|
|
24
|
+
#elif defined(__GNUC__)
|
|
25
|
+
#pragma GCC diagnostic pop
|
|
26
|
+
#endif
|
|
27
|
+
|
|
28
|
+
#include <ctype.h>
|
|
29
|
+
#include <stdlib.h>
|
|
30
|
+
#include <string.h>
|
|
31
|
+
|
|
32
|
+
/* ------------------------------------------------------------------ */
|
|
33
|
+
/* Dynamic string buffer */
|
|
34
|
+
/* ------------------------------------------------------------------ */
|
|
35
|
+
|
|
36
|
+
/**
 * Growable byte buffer that is kept NUL-terminated after every operation.
 *
 * data - heap storage, or NULL when no storage could be allocated
 * len  - number of bytes stored, not counting the terminating NUL
 * cap  - size of the allocation behind data, in bytes (0 when data is NULL)
 */
typedef struct {
  char *data;
  size_t len;
  size_t cap;
} buffer_t;

/**
 * Create an empty buffer with room for at least 64 bytes.
 *
 * If the allocation fails the returned buffer has data == NULL and cap == 0;
 * later appends will retry the allocation.
 */
static buffer_t buffer_create(size_t initial_cap) {
  buffer_t b;
  b.cap = initial_cap > 64 ? initial_cap : 64;
  b.len = 0;
  b.data = (char *)malloc(b.cap);
  if (b.data) {
    b.data[0] = '\0';
  } else {
    b.cap = 0; /* keep cap consistent with the (missing) storage */
  }
  return b;
}

/**
 * Make sure `extra` more bytes plus the terminating NUL fit in the buffer.
 *
 * Capacity grows by doubling. Returns true when the space is available.
 * Returns false, leaving the buffer untouched, when the requested size would
 * overflow size_t or the allocation fails.
 */
static bool buffer_ensure(buffer_t *b, size_t extra) {
  const size_t size_max = (size_t)-1;

  if (!b->data) {
    /* No storage yet (earlier allocation failure): start from scratch. */
    b->len = 0;
    b->cap = 0;
  }

  if (extra >= size_max - b->len)
    return false; /* len + extra + 1 would wrap */

  const size_t needed = b->len + extra + 1;
  if (b->data && needed <= b->cap)
    return true;

  size_t new_cap = b->cap ? b->cap : 64;
  while (new_cap < needed) {
    if (new_cap > size_max / 2) {
      new_cap = needed; /* doubling would wrap; take exactly what is needed */
      break;
    }
    new_cap *= 2;
  }

  /* Use a temporary so the old block is not lost if realloc fails. */
  char *grown = (char *)realloc(b->data, new_cap);
  if (!grown)
    return false;

  if (!b->data)
    grown[0] = '\0';
  b->data = grown;
  b->cap = new_cap;
  return true;
}

/**
 * Append n bytes from s. A NULL s or n == 0 is a no-op.
 *
 * If the buffer cannot grow, the chunk is dropped and the existing content
 * stays intact and NUL-terminated.
 */
static void buffer_append(buffer_t *b, const char *s, size_t n) {
  if (!s || n == 0)
    return;
  if (!buffer_ensure(b, n))
    return;
  memcpy(b->data + b->len, s, n);
  b->len += n;
  b->data[b->len] = '\0';
}

/** Append a NUL-terminated string. A NULL s is a no-op. */
static void buffer_append_str(buffer_t *b, const char *s) {
  if (!s)
    return;
  buffer_append(b, s, strlen(s));
}

/** Reset the buffer to the empty string without releasing its storage. */
static void buffer_clear(buffer_t *b) {
  b->len = 0;
  if (b->data)
    b->data[0] = '\0';
}

/**
 * Hand the underlying storage to the caller, who becomes responsible for
 * free()ing it. May return NULL if no storage was ever allocated.
 */
static char *buffer_finish(buffer_t *b) { return b->data; /* caller owns */ }
|
|
81
|
+
|
|
82
|
+
/* ------------------------------------------------------------------ */
|
|
83
|
+
/* Tag classification helpers */
|
|
84
|
+
/* ------------------------------------------------------------------ */
|
|
85
|
+
|
|
86
|
+
/** How the normalizer treats an element, decided from its tag name. */
typedef enum {
  TAG_CLASS_SKIP,         /* tag stripped, children processed */
  TAG_CLASS_INLINE,       /* canonical inline tag */
  TAG_CLASS_BLOCK,        /* canonical block tag */
  TAG_CLASS_SELF_CLOSING, /* e.g. <br>, <img> */
  TAG_CLASS_PASS,         /* pass-through (e.g. <html>, <body>) */
} tag_class_t;

/**
 * Map an alias tag name to its canonical spelling.
 *
 * Returns the canonical name ("strong" -> "b", "em" -> "i", "del"/"strike"
 * -> "s", "ins" -> "u", "pre" -> "codeblock"), or NULL when `name` is NULL
 * or has no alias entry. The comparison is case-sensitive.
 */
static const char *canonical_name(const char *name) {
  static const struct {
    const char *alias;
    const char *canonical;
  } aliases[] = {
      {"strong", "b"}, {"em", "i"},  {"del", "s"},
      {"strike", "s"}, {"ins", "u"}, {"pre", "codeblock"},
  };

  if (!name)
    return NULL;
  for (size_t i = 0; i < sizeof(aliases) / sizeof(aliases[0]); i++) {
    if (strcmp(name, aliases[i].alias) == 0)
      return aliases[i].canonical;
  }
  return NULL;
}

/**
 * Classify a tag name.
 *
 * Names not listed in the table, and a NULL name, are TAG_CLASS_SKIP.
 * The comparison is case-sensitive.
 */
static tag_class_t classify_tag(const char *name) {
  static const struct {
    const char *name;
    tag_class_t cls;
  } known[] = {
      /* inline */
      {"b", TAG_CLASS_INLINE},
      {"i", TAG_CLASS_INLINE},
      {"u", TAG_CLASS_INLINE},
      {"s", TAG_CLASS_INLINE},
      {"code", TAG_CLASS_INLINE},
      {"a", TAG_CLASS_INLINE},
      {"strong", TAG_CLASS_INLINE},
      {"em", TAG_CLASS_INLINE},
      {"del", TAG_CLASS_INLINE},
      {"strike", TAG_CLASS_INLINE},
      {"ins", TAG_CLASS_INLINE},
      {"mention", TAG_CLASS_INLINE},
      /* block */
      {"p", TAG_CLASS_BLOCK},
      {"h1", TAG_CLASS_BLOCK},
      {"h2", TAG_CLASS_BLOCK},
      {"h3", TAG_CLASS_BLOCK},
      {"h4", TAG_CLASS_BLOCK},
      {"h5", TAG_CLASS_BLOCK},
      {"h6", TAG_CLASS_BLOCK},
      {"ul", TAG_CLASS_BLOCK},
      {"ol", TAG_CLASS_BLOCK},
      {"li", TAG_CLASS_BLOCK},
      {"blockquote", TAG_CLASS_BLOCK},
      {"codeblock", TAG_CLASS_BLOCK},
      {"pre", TAG_CLASS_BLOCK},
      /* self-closing */
      {"br", TAG_CLASS_SELF_CLOSING},
      {"img", TAG_CLASS_SELF_CLOSING},
      /* document scaffolding */
      {"html", TAG_CLASS_PASS},
      {"head", TAG_CLASS_PASS},
      {"body", TAG_CLASS_PASS},
  };

  if (!name)
    return TAG_CLASS_SKIP;
  for (size_t i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
    if (strcmp(name, known[i].name) == 0)
      return known[i].cls;
  }
  return TAG_CLASS_SKIP;
}
|
|
135
|
+
|
|
136
|
+
/* ------------------------------------------------------------------ */
|
|
137
|
+
/* DOM helpers — get tag name, node type checks */
|
|
138
|
+
/* ------------------------------------------------------------------ */
|
|
139
|
+
|
|
140
|
+
/** True when node is non-NULL and its type is ELEMENT or TEMPLATE. */
static bool is_element(GumboNode *node) {
  if (!node)
    return false;
  switch (node->type) {
  case GUMBO_NODE_ELEMENT:
  case GUMBO_NODE_TEMPLATE:
    return true;
  default:
    return false;
  }
}
|
|
144
|
+
|
|
145
|
+
/** True when node is non-NULL and its type is TEXT or WHITESPACE. */
static bool is_text(GumboNode *node) {
  if (!node)
    return false;
  switch (node->type) {
  case GUMBO_NODE_TEXT:
  case GUMBO_NODE_WHITESPACE:
    return true;
  default:
    return false;
  }
}
|
|
149
|
+
|
|
150
|
+
/** Get the lowercased tag name of an element node into buf. */
|
|
151
|
+
static const char *get_tag_name(GumboNode *node, char *buf, size_t buf_sz) {
|
|
152
|
+
if (!is_element(node))
|
|
153
|
+
return NULL;
|
|
154
|
+
GumboElement *el = &node->v.element;
|
|
155
|
+
|
|
156
|
+
if (el->tag != GUMBO_TAG_UNKNOWN) {
|
|
157
|
+
const char *name = gumbo_normalized_tagname(el->tag);
|
|
158
|
+
if (name && name[0]) {
|
|
159
|
+
size_t n = strlen(name);
|
|
160
|
+
if (n >= buf_sz)
|
|
161
|
+
n = buf_sz - 1;
|
|
162
|
+
memcpy(buf, name, n);
|
|
163
|
+
buf[n] = '\0';
|
|
164
|
+
return buf;
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
/* Unknown tag — extract from original_tag */
|
|
169
|
+
GumboStringPiece piece = el->original_tag;
|
|
170
|
+
gumbo_tag_from_original_text(&piece);
|
|
171
|
+
if (piece.data && piece.length > 0) {
|
|
172
|
+
size_t n = piece.length < buf_sz - 1 ? piece.length : buf_sz - 1;
|
|
173
|
+
for (size_t i = 0; i < n; i++)
|
|
174
|
+
buf[i] = (char)tolower((unsigned char)piece.data[i]);
|
|
175
|
+
buf[n] = '\0';
|
|
176
|
+
return buf;
|
|
177
|
+
}
|
|
178
|
+
return NULL;
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
/** True when node is an element whose tag name is "ul" or "ol". */
static bool is_list_node(GumboNode *node) {
  char tag[64];
  const char *name = get_tag_name(node, tag, sizeof(tag));
  if (name == NULL)
    return false;
  return strcmp(name, "ul") == 0 || strcmp(name, "ol") == 0;
}
|
|
186
|
+
|
|
187
|
+
/** True when node is an element whose tag name is "blockquote". */
static bool is_blockquote_node(GumboNode *node) {
  char tag[64];
  const char *name = get_tag_name(node, tag, sizeof(tag));
  if (name == NULL)
    return false;
  return strcmp(name, "blockquote") == 0;
}
|
|
192
|
+
|
|
193
|
+
/** True when node is an element whose tag name is "br". */
static bool is_br_node(GumboNode *node) {
  char tag[64];
  const char *name = get_tag_name(node, tag, sizeof(tag));
  if (name == NULL)
    return false;
  return strcmp(name, "br") == 0;
}
|
|
198
|
+
|
|
199
|
+
/**
 * True when node is an element that produces a block: either its tag is
 * classified TAG_CLASS_BLOCK, or it is a "div", "table" or "tr".
 */
static bool is_block_producing(GumboNode *node) {
  char tag[64];
  const char *name = get_tag_name(node, tag, sizeof(tag));
  if (name == NULL)
    return false;
  return classify_tag(name) == TAG_CLASS_BLOCK || strcmp(name, "div") == 0 ||
         strcmp(name, "table") == 0 || strcmp(name, "tr") == 0;
}
|
|
209
|
+
|
|
210
|
+
/** True if all children are inline/text (no block-producing elements). */
|
|
211
|
+
static bool is_purely_inline(GumboNode *node) {
|
|
212
|
+
if (!is_element(node))
|
|
213
|
+
return true;
|
|
214
|
+
GumboVector *children = &node->v.element.children;
|
|
215
|
+
for (unsigned int i = 0; i < children->length; i++) {
|
|
216
|
+
if (is_block_producing(children->data[i]))
|
|
217
|
+
return false;
|
|
218
|
+
}
|
|
219
|
+
return true;
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
/** True if any direct child is block-producing or a blockquote. */
|
|
223
|
+
static bool has_block_or_bq_child(GumboNode *node) {
|
|
224
|
+
if (!is_element(node))
|
|
225
|
+
return false;
|
|
226
|
+
GumboVector *children = &node->v.element.children;
|
|
227
|
+
for (unsigned int i = 0; i < children->length; i++) {
|
|
228
|
+
GumboNode *c = children->data[i];
|
|
229
|
+
if (is_block_producing(c) || is_blockquote_node(c))
|
|
230
|
+
return true;
|
|
231
|
+
}
|
|
232
|
+
return false;
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
/* ------------------------------------------------------------------ */
|
|
236
|
+
/* CSS style → canonical tag mapping (simple string matching) */
|
|
237
|
+
/* ------------------------------------------------------------------ */
|
|
238
|
+
|
|
239
|
+
/** Text styles detected in an inline CSS declaration list. */
typedef struct {
  bool bold;          /* font-weight is bold/bolder or numerically >= 700 */
  bool italic;        /* font-style is italic or oblique */
  bool underline;     /* a text-decoration[-line] value contains "underline" */
  bool strikethrough; /* a text-decoration[-line] value contains "line-through" */
} css_styles_t;

/**
 * Find the first declaration of `prop_name` in a CSS declaration list.
 *
 * style/style_len - the text to scan (need not be NUL-terminated)
 * prop_name       - property to look for; matched ASCII case-insensitively
 *                   and only as a whole name (it must be followed by
 *                   optional blanks and ':')
 * val_len         - receives the length of the value
 *
 * Returns a pointer into `style` at the start of the value, with surrounding
 * spaces/tabs trimmed and the terminating ';' excluded, or NULL when the
 * property is not present (val_len is left untouched in that case).
 */
static const char *find_css_value(const char *style, size_t style_len,
                                  const char *prop_name, size_t *val_len) {
  if (!style || !prop_name || !val_len)
    return NULL;

  const size_t plen = strlen(prop_name);
  const char *end = style + style_len;
  const char *p = style;

  while (p < end) {
    /* Skip separators between declarations. */
    while (p < end &&
           (*p == ' ' || *p == '\t' || *p == ';' || *p == '\n' || *p == '\r'))
      p++;
    if (p >= end)
      break;

    bool name_matches = (size_t)(end - p) > plen;
    for (size_t i = 0; name_matches && i < plen; i++) {
      /* CSS property names are ASCII case-insensitive. */
      if (tolower((unsigned char)p[i]) != tolower((unsigned char)prop_name[i]))
        name_matches = false;
    }

    if (name_matches) {
      const char *after = p + plen;
      while (after < end && (*after == ' ' || *after == '\t'))
        after++;
      /* Requiring ':' here rejects longer names that merely share a prefix,
       * e.g. "text-decoration-color" when looking for "text-decoration". */
      if (after < end && *after == ':') {
        after++;
        while (after < end && (*after == ' ' || *after == '\t'))
          after++;
        const char *val_start = after;
        while (after < end && *after != ';')
          after++;
        const char *val_end = after;
        while (val_end > val_start &&
               (*(val_end - 1) == ' ' || *(val_end - 1) == '\t'))
          val_end--;
        *val_len = (size_t)(val_end - val_start);
        return val_start;
      }
    }

    /* Not the property we want: advance past this declaration. */
    while (p < end && *p != ';')
      p++;
    if (p < end)
      p++;
  }
  return NULL;
}

/**
 * Case-insensitive substring test on a length-delimited value.
 * Returns true when `needle` occurs anywhere in val[0..val_len).
 */
static bool css_val_contains(const char *val, size_t val_len,
                             const char *needle) {
  if (!val || !needle)
    return false;

  const size_t nlen = strlen(needle);
  if (nlen > val_len)
    return false;

  for (size_t i = 0; i <= val_len - nlen; i++) {
    size_t j = 0;
    while (j < nlen && tolower((unsigned char)val[i + j]) ==
                           tolower((unsigned char)needle[j]))
      j++;
    if (j == nlen)
      return true;
  }
  return false;
}

/**
 * Derive bold/italic/underline/strikethrough flags from an inline style.
 *
 * style_value/style_len - contents of a style attribute; NULL or empty
 *                         yields all flags false
 *
 * font-weight and font-style use their first declaration. Every
 * `text-decoration-line` and `text-decoration` declaration contributes to
 * underline/strikethrough, independent of the order they appear in.
 */
static css_styles_t parse_css_style(const char *style_value, size_t style_len) {
  css_styles_t result = {false, false, false, false};
  if (!style_value || style_len == 0)
    return result;

  size_t vlen = 0;
  const char *val;

  val = find_css_value(style_value, style_len, "font-weight", &vlen);
  if (val) {
    /* "bold" also matches "bolder". */
    if (css_val_contains(val, vlen, "bold")) {
      result.bold = true;
    } else {
      /* Bounded, saturating parse: never reads past the value and cannot
       * overflow on absurdly long digit runs. */
      size_t i = 0;
      int weight = 0;
      while (i < vlen && isspace((unsigned char)val[i]))
        i++;
      if (i < vlen && val[i] == '+')
        i++;
      while (i < vlen && isdigit((unsigned char)val[i])) {
        if (weight < 10000)
          weight = weight * 10 + (val[i] - '0');
        i++;
      }
      if (weight >= 700)
        result.bold = true;
    }
  }

  val = find_css_value(style_value, style_len, "font-style", &vlen);
  if (val) {
    if (css_val_contains(val, vlen, "italic") ||
        css_val_contains(val, vlen, "oblique"))
      result.italic = true;
  }

  {
    static const char *const decoration_props[] = {"text-decoration-line",
                                                   "text-decoration"};
    const char *style_end = style_value + style_len;

    for (size_t k = 0;
         k < sizeof(decoration_props) / sizeof(decoration_props[0]); k++) {
      const char *cursor = style_value;
      while (cursor < style_end) {
        val = find_css_value(cursor, (size_t)(style_end - cursor),
                             decoration_props[k], &vlen);
        if (!val)
          break;
        if (css_val_contains(val, vlen, "underline"))
          result.underline = true;
        if (css_val_contains(val, vlen, "line-through"))
          result.strikethrough = true;
        /* val lies beyond the property name, so this always advances. */
        cursor = val + vlen;
      }
    }
  }

  return result;
}
|
|
356
|
+
|
|
357
|
+
/*
 * Drop the style flag that the tag itself already expresses, so that e.g. a
 * <b style="font-weight:bold"> does not get a second nested <b>.
 *
 * s   - styles parsed from the element's style attribute (taken by value)
 * tag - NUL-terminated output tag name
 *
 * Returns `s` with bold / italic / underline / strikethrough cleared when
 * tag is exactly "b" / "i" / "u" / "s" respectively; otherwise unchanged.
 */
static css_styles_t extra_styles(css_styles_t s, const char *tag) {
  /* Only single-character tag names can match. */
  if (tag[0] == '\0' || tag[1] != '\0')
    return s;

  switch (tag[0]) {
  case 'b':
    s.bold = false;
    break;
  case 'i':
    s.italic = false;
    break;
  case 'u':
    s.underline = false;
    break;
  case 's':
    s.strikethrough = false;
    break;
  default:
    break;
  }
  return s;
}
|
|
368
|
+
|
|
369
|
+
/*
 * Append the opening inline tags for every flag set in `s`, in the fixed
 * order <b>, <i>, <u>, <s>.
 */
static void emit_styles_open(buffer_t *out, css_styles_t s) {
  static const char *const open_tags[] = {"<b>", "<i>", "<u>", "<s>"};
  const bool enabled[] = {s.bold, s.italic, s.underline, s.strikethrough};

  for (size_t k = 0; k < sizeof open_tags / sizeof open_tags[0]; k++) {
    if (enabled[k])
      buffer_append_str(out, open_tags[k]);
  }
}
|
|
379
|
+
|
|
380
|
+
/*
 * Append the closing inline tags for every flag set in `s`, in the fixed
 * order </s>, </u>, </i>, </b> (the reverse of the opening order, so the
 * tags nest properly).
 */
static void emit_styles_close(buffer_t *out, css_styles_t s) {
  static const char *const close_tags[] = {"</s>", "</u>", "</i>", "</b>"};
  const bool enabled[] = {s.strikethrough, s.underline, s.italic, s.bold};

  for (size_t k = 0; k < sizeof close_tags / sizeof close_tags[0]; k++) {
    if (enabled[k])
      buffer_append_str(out, close_tags[k]);
  }
}
|
|
390
|
+
|
|
391
|
+
/* ------------------------------------------------------------------ */
|
|
392
|
+
/* Attribute emission helpers */
|
|
393
|
+
/* ------------------------------------------------------------------ */
|
|
394
|
+
|
|
395
|
+
/*
 * Look up attribute `name` on element `el`.
 *
 * Returns the attribute's value string, or NULL when gumbo_get_attribute()
 * finds no such attribute.
 */
static const char *get_attr(GumboElement *el, const char *name) {
  const GumboAttribute *found = gumbo_get_attribute(&el->attributes, name);
  return found ? found->value : NULL;
}
|
|
401
|
+
|
|
402
|
+
/*
 * Append ` name="value"` for attribute `attr_name` of `el`.
 *
 * Nothing is written when the attribute is absent or its value is empty.
 * The value is escaped for a double-quoted attribute context: '"', '&', '<'
 * and '>' become entities, so a value containing a quote cannot terminate
 * the attribute early and inject markup into the output.
 *
 * out       - destination buffer
 * el        - element to read the attribute from
 * attr_name - attribute name, written verbatim
 */
static void emit_one_attr(buffer_t *out, GumboElement *el,
                          const char *attr_name) {
  const char *val = get_attr(el, attr_name);
  if (!val || !val[0])
    return;

  buffer_append_str(out, " ");
  buffer_append_str(out, attr_name);
  buffer_append_str(out, "=\"");

  for (const char *p = val; *p; p++) {
    char c = *p;
    switch (c) {
    case '"':
      buffer_append_str(out, "&quot;");
      break;
    case '&':
      buffer_append_str(out, "&amp;");
      break;
    case '<':
      buffer_append_str(out, "&lt;");
      break;
    case '>':
      buffer_append_str(out, "&gt;");
      break;
    default:
      buffer_append(out, &c, 1);
      break;
    }
  }

  buffer_append_str(out, "\"");
}
|
|
413
|
+
|
|
414
|
+
static void emit_attributes(GumboElement *el, const char *tag_name,
|
|
415
|
+
buffer_t *out) {
|
|
416
|
+
if (strcmp(tag_name, "a") == 0) {
|
|
417
|
+
emit_one_attr(out, el, "href");
|
|
418
|
+
} else if (strcmp(tag_name, "img") == 0) {
|
|
419
|
+
emit_one_attr(out, el, "src");
|
|
420
|
+
emit_one_attr(out, el, "alt");
|
|
421
|
+
emit_one_attr(out, el, "width");
|
|
422
|
+
emit_one_attr(out, el, "height");
|
|
423
|
+
} else if (strcmp(tag_name, "ul") == 0) {
|
|
424
|
+
const char *val = get_attr(el, "data-type");
|
|
425
|
+
if (val && strcmp(val, "checkbox") == 0)
|
|
426
|
+
buffer_append_str(out, " data-type=\"checkbox\"");
|
|
427
|
+
} else if (strcmp(tag_name, "li") == 0) {
|
|
428
|
+
if (gumbo_get_attribute(&el->attributes, "checked") != NULL)
|
|
429
|
+
buffer_append_str(out, " checked");
|
|
430
|
+
} else if (strcmp(tag_name, "mention") == 0) {
|
|
431
|
+
emit_one_attr(out, el, "id");
|
|
432
|
+
emit_one_attr(out, el, "text");
|
|
433
|
+
emit_one_attr(out, el, "indicator");
|
|
434
|
+
}
|
|
435
|
+
}
|
|
436
|
+
|
|
437
|
+
/* ------------------------------------------------------------------ */
|
|
438
|
+
/* Google Docs specific handling */
|
|
439
|
+
/* ------------------------------------------------------------------ */
|
|
440
|
+
|
|
441
|
+
/*
 * Detect the <b id="docs-internal-guid-..."> element that wraps content
 * pasted from Google Docs.
 *
 * Returns true only for tag "b" whose id starts with "docs-internal-guid-"
 * and is longer than 20 characters in total.
 */
static bool is_google_docs_wrapper(GumboElement *el, const char *tag_name) {
  static const char guid_prefix[] = "docs-internal-guid-";
  const size_t prefix_len = sizeof guid_prefix - 1; /* 19 characters */

  if (strcmp(tag_name, "b") != 0)
    return false;

  const char *id_val = get_attr(el, "id");
  if (id_val == NULL)
    return false;

  if (strncmp(id_val, guid_prefix, prefix_len) != 0)
    return false;

  /* Same threshold as before: total id length must exceed 20. */
  return strlen(id_val) > prefix_len + 1;
}
|
|
450
|
+
|
|
451
|
+
/* ------------------------------------------------------------------ */
|
|
452
|
+
/* Recursive DOM tree walker */
|
|
453
|
+
/* ------------------------------------------------------------------ */
|
|
454
|
+
|
|
455
|
+
static void walk_node(GumboNode *node, buffer_t *out);
|
|
456
|
+
|
|
457
|
+
/* ------------------------------------------------------------------ */
|
|
458
|
+
/* Blockquote content flattening */
|
|
459
|
+
/* ------------------------------------------------------------------ */
|
|
460
|
+
|
|
461
|
+
static void flatten_bq_node(GumboNode *node, buffer_t *ib, buffer_t *out);
|
|
462
|
+
|
|
463
|
+
/*
 * If the inline buffer `ib` holds anything, write it to `out` wrapped in
 * <p>...</p> and clear `ib`. An empty buffer produces no output.
 */
static void flush_inline_p(buffer_t *ib, buffer_t *out) {
  if (ib->len == 0)
    return;

  buffer_append_str(out, "<p>");
  buffer_append(out, ib->data, ib->len);
  buffer_append_str(out, "</p>");
  buffer_clear(ib);
}
|
|
471
|
+
|
|
472
|
+
/*
 * Run flatten_bq_node() over every child of `node`, in document order.
 * Does nothing when `node` is not an element.
 *
 * ib  - pending inline content shared across the whole blockquote
 * out - destination for finished paragraphs
 */
static void flatten_bq_children(GumboNode *node, buffer_t *ib, buffer_t *out) {
  if (is_element(node)) {
    const GumboVector *kids = &node->v.element.children;
    unsigned int idx = 0;
    while (idx < kids->length) {
      flatten_bq_node(kids->data[idx], ib, out);
      idx++;
    }
  }
}
|
|
480
|
+
|
|
481
|
+
/*
 * Flatten one node that lives inside a blockquote.
 *
 *   - text nodes and ordinary inline elements are rendered into `ib`
 *   - <br> closes the pending paragraph
 *   - block-producing elements and nested blockquotes close the pending
 *     paragraph, have their children flattened in place, then close again
 *   - any other non-element node is ignored
 *
 * ib  - pending inline content of the current paragraph
 * out - destination for finished <p> paragraphs
 */
static void flatten_bq_node(GumboNode *node, buffer_t *ib, buffer_t *out) {
  if (!node)
    return;

  if (!is_text(node)) {
    if (!is_element(node))
      return;

    if (is_br_node(node)) {
      flush_inline_p(ib, out);
      return;
    }

    if (is_block_producing(node) || is_blockquote_node(node)) {
      flush_inline_p(ib, out);
      flatten_bq_children(node, ib, out);
      flush_inline_p(ib, out);
      return;
    }
  }

  /* Text, or an inline element: accumulate into the pending paragraph. */
  walk_node(node, ib);
}
|
|
503
|
+
|
|
504
|
+
/* ------------------------------------------------------------------ */
|
|
505
|
+
/* List item content flattening */
|
|
506
|
+
/* ------------------------------------------------------------------ */
|
|
507
|
+
|
|
508
|
+
typedef struct {
|
|
509
|
+
GumboElement *el;
|
|
510
|
+
css_styles_t styles;
|
|
511
|
+
GumboNode **nested_lists;
|
|
512
|
+
int *nested_count;
|
|
513
|
+
int max_nested;
|
|
514
|
+
} li_ctx_t;
|
|
515
|
+
|
|
516
|
+
static void flatten_li_node(GumboNode *node, buffer_t *ib, buffer_t *out,
|
|
517
|
+
li_ctx_t *ctx);
|
|
518
|
+
|
|
519
|
+
/*
 * If `ib` holds pending inline content, emit it to `out` as one complete
 * <li ...>...</li> item and clear `ib`. The item carries the attributes of
 * ctx->el and is wrapped in the inline style tags of ctx->styles. An empty
 * buffer produces no output.
 */
static void flush_li_buffer(buffer_t *ib, buffer_t *out, li_ctx_t *ctx) {
  if (ib->len > 0) {
    /* Opening tag with whitelisted attributes. */
    buffer_append_str(out, "<li");
    emit_attributes(ctx->el, "li", out);
    buffer_append_str(out, ">");

    /* Styled content. */
    emit_styles_open(out, ctx->styles);
    buffer_append(out, ib->data, ib->len);
    emit_styles_close(out, ctx->styles);

    buffer_append_str(out, "</li>");
    buffer_clear(ib);
  }
}
|
|
531
|
+
|
|
532
|
+
static void flatten_li_children(GumboNode *node, buffer_t *ib, buffer_t *out,
|
|
533
|
+
li_ctx_t *ctx) {
|
|
534
|
+
if (!is_element(node))
|
|
535
|
+
return;
|
|
536
|
+
GumboVector *children = &node->v.element.children;
|
|
537
|
+
for (unsigned int i = 0; i < children->length; i++) {
|
|
538
|
+
flatten_li_node(children->data[i], ib, out, ctx);
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
static void flatten_li_node(GumboNode *node, buffer_t *ib, buffer_t *out,
|
|
543
|
+
li_ctx_t *ctx) {
|
|
544
|
+
if (!node)
|
|
545
|
+
return;
|
|
546
|
+
if (is_text(node)) {
|
|
547
|
+
walk_node(node, ib);
|
|
548
|
+
return;
|
|
549
|
+
}
|
|
550
|
+
if (!is_element(node)) {
|
|
551
|
+
flatten_li_children(node, ib, out, ctx);
|
|
552
|
+
return;
|
|
553
|
+
}
|
|
554
|
+
if (is_list_node(node)) {
|
|
555
|
+
if (*ctx->nested_count < ctx->max_nested) {
|
|
556
|
+
ctx->nested_lists[*ctx->nested_count] = node;
|
|
557
|
+
(*ctx->nested_count)++;
|
|
558
|
+
}
|
|
559
|
+
return;
|
|
560
|
+
}
|
|
561
|
+
if (is_br_node(node)) {
|
|
562
|
+
flush_li_buffer(ib, out, ctx);
|
|
563
|
+
return;
|
|
564
|
+
}
|
|
565
|
+
if (is_block_producing(node) || is_blockquote_node(node)) {
|
|
566
|
+
flush_li_buffer(ib, out, ctx);
|
|
567
|
+
flatten_li_children(node, ib, out, ctx);
|
|
568
|
+
flush_li_buffer(ib, out, ctx);
|
|
569
|
+
return;
|
|
570
|
+
}
|
|
571
|
+
walk_node(node, ib);
|
|
572
|
+
}
|
|
573
|
+
|
|
574
|
+
/* ------------------------------------------------------------------ */
|
|
575
|
+
/* walk_children — the main child-iteration driver */
|
|
576
|
+
/* ------------------------------------------------------------------ */
|
|
577
|
+
|
|
578
|
+
static void walk_children(GumboNode *node, buffer_t *out) {
|
|
579
|
+
if (!is_element(node))
|
|
580
|
+
return;
|
|
581
|
+
|
|
582
|
+
GumboVector *children = &node->v.element.children;
|
|
583
|
+
bool parent_is_list = is_list_node(node);
|
|
584
|
+
|
|
585
|
+
/* Detect mixed content: does the parent have any block-producing child? */
|
|
586
|
+
bool has_block = false;
|
|
587
|
+
for (unsigned int j = 0; j < children->length; j++) {
|
|
588
|
+
if (is_block_producing(children->data[j])) {
|
|
589
|
+
has_block = true;
|
|
590
|
+
break;
|
|
591
|
+
}
|
|
592
|
+
}
|
|
593
|
+
|
|
594
|
+
unsigned int i = 0;
|
|
595
|
+
while (i < children->length) {
|
|
596
|
+
GumboNode *child = children->data[i];
|
|
597
|
+
|
|
598
|
+
/* Flatten list-inside-list */
|
|
599
|
+
if (parent_is_list && is_list_node(child)) {
|
|
600
|
+
walk_children(child, out);
|
|
601
|
+
i++;
|
|
602
|
+
continue;
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
/* Merge consecutive blockquotes, flattening content into <p>s */
|
|
606
|
+
if (is_blockquote_node(child)) {
|
|
607
|
+
buffer_append_str(out, "<blockquote>");
|
|
608
|
+
buffer_t bq_ib = buffer_create(64);
|
|
609
|
+
while (i < children->length && is_blockquote_node(children->data[i])) {
|
|
610
|
+
flatten_bq_children(children->data[i], &bq_ib, out);
|
|
611
|
+
i++;
|
|
612
|
+
}
|
|
613
|
+
flush_inline_p(&bq_ib, out);
|
|
614
|
+
free(bq_ib.data);
|
|
615
|
+
buffer_append_str(out, "</blockquote>");
|
|
616
|
+
continue;
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
/* Auto-paragraph: group inline runs into <p> when mixed with blocks */
|
|
620
|
+
if (has_block && !parent_is_list && !is_block_producing(child) &&
|
|
621
|
+
!is_blockquote_node(child)) {
|
|
622
|
+
buffer_t ib = buffer_create(64);
|
|
623
|
+
while (i < children->length && !is_block_producing(children->data[i]) &&
|
|
624
|
+
!is_blockquote_node(children->data[i])) {
|
|
625
|
+
child = children->data[i];
|
|
626
|
+
if (is_br_node(child)) {
|
|
627
|
+
if (ib.len > 0)
|
|
628
|
+
flush_inline_p(&ib, out);
|
|
629
|
+
else
|
|
630
|
+
buffer_append_str(out, "<br>");
|
|
631
|
+
i++;
|
|
632
|
+
continue;
|
|
633
|
+
}
|
|
634
|
+
/* Transparent inline wrapper for block/bq children */
|
|
635
|
+
if (is_element(child) && has_block_or_bq_child(child)) {
|
|
636
|
+
flush_inline_p(&ib, out);
|
|
637
|
+
walk_children(child, out);
|
|
638
|
+
i++;
|
|
639
|
+
continue;
|
|
640
|
+
}
|
|
641
|
+
walk_node(child, &ib);
|
|
642
|
+
i++;
|
|
643
|
+
}
|
|
644
|
+
flush_inline_p(&ib, out);
|
|
645
|
+
free(ib.data);
|
|
646
|
+
continue;
|
|
647
|
+
}
|
|
648
|
+
|
|
649
|
+
walk_node(child, out);
|
|
650
|
+
i++;
|
|
651
|
+
}
|
|
652
|
+
}
|
|
653
|
+
|
|
654
|
+
/* ------------------------------------------------------------------ */
|
|
655
|
+
/* walk_node — process a single DOM node */
|
|
656
|
+
/* ------------------------------------------------------------------ */
|
|
657
|
+
|
|
658
|
+
static void walk_node(GumboNode *node, buffer_t *out) {
|
|
659
|
+
if (!node)
|
|
660
|
+
return;
|
|
661
|
+
|
|
662
|
+
/* Text node */
|
|
663
|
+
if (is_text(node)) {
|
|
664
|
+
const char *text_raw = node->v.text.text;
|
|
665
|
+
if (text_raw) {
|
|
666
|
+
size_t text_len = strlen(text_raw);
|
|
667
|
+
for (size_t i = 0; i < text_len; i++) {
|
|
668
|
+
char c = text_raw[i];
|
|
669
|
+
switch (c) {
|
|
670
|
+
case '<':
|
|
671
|
+
buffer_append_str(out, "<");
|
|
672
|
+
break;
|
|
673
|
+
case '>':
|
|
674
|
+
buffer_append_str(out, ">");
|
|
675
|
+
break;
|
|
676
|
+
case '&':
|
|
677
|
+
buffer_append_str(out, "&");
|
|
678
|
+
break;
|
|
679
|
+
default:
|
|
680
|
+
buffer_append(out, &c, 1);
|
|
681
|
+
break;
|
|
682
|
+
}
|
|
683
|
+
}
|
|
684
|
+
}
|
|
685
|
+
return;
|
|
686
|
+
}
|
|
687
|
+
|
|
688
|
+
if (!is_element(node)) {
|
|
689
|
+
walk_children(node, out);
|
|
690
|
+
return;
|
|
691
|
+
}
|
|
692
|
+
|
|
693
|
+
GumboElement *el = &node->v.element;
|
|
694
|
+
char name_buf[64];
|
|
695
|
+
if (!get_tag_name(node, name_buf, sizeof(name_buf))) {
|
|
696
|
+
walk_children(node, out);
|
|
697
|
+
return;
|
|
698
|
+
}
|
|
699
|
+
|
|
700
|
+
/* Strip <meta>, <style>, <script>, <title>, <link> */
|
|
701
|
+
if (strcmp(name_buf, "meta") == 0 || strcmp(name_buf, "style") == 0 ||
|
|
702
|
+
strcmp(name_buf, "script") == 0 || strcmp(name_buf, "title") == 0 ||
|
|
703
|
+
strcmp(name_buf, "link") == 0)
|
|
704
|
+
return;
|
|
705
|
+
|
|
706
|
+
/* Google Docs wrapper */
|
|
707
|
+
if (is_google_docs_wrapper(el, name_buf)) {
|
|
708
|
+
walk_children(node, out);
|
|
709
|
+
return;
|
|
710
|
+
}
|
|
711
|
+
|
|
712
|
+
const char *out_name = canonical_name(name_buf);
|
|
713
|
+
if (!out_name)
|
|
714
|
+
out_name = name_buf;
|
|
715
|
+
|
|
716
|
+
tag_class_t cls = classify_tag(name_buf);
|
|
717
|
+
|
|
718
|
+
/* --- <span>: CSS style → inline tags --- */
|
|
719
|
+
if (strcmp(name_buf, "span") == 0) {
|
|
720
|
+
const char *sval = get_attr(el, "style");
|
|
721
|
+
size_t slen = sval ? strlen(sval) : 0;
|
|
722
|
+
css_styles_t s = parse_css_style(sval, slen);
|
|
723
|
+
emit_styles_open(out, s);
|
|
724
|
+
walk_children(node, out);
|
|
725
|
+
emit_styles_close(out, s);
|
|
726
|
+
return;
|
|
727
|
+
}
|
|
728
|
+
|
|
729
|
+
/* --- <div>: becomes <p> or passes through --- */
|
|
730
|
+
if (strcmp(name_buf, "div") == 0) {
|
|
731
|
+
const char *sval = get_attr(el, "style");
|
|
732
|
+
size_t slen = sval ? strlen(sval) : 0;
|
|
733
|
+
css_styles_t s = parse_css_style(sval, slen);
|
|
734
|
+
|
|
735
|
+
if (is_purely_inline(node)) {
|
|
736
|
+
/* Split on <br> into separate <p>s */
|
|
737
|
+
buffer_t pb = buffer_create(64);
|
|
738
|
+
GumboVector *div_children = &el->children;
|
|
739
|
+
for (unsigned int di = 0; di < div_children->length; di++) {
|
|
740
|
+
GumboNode *dc = div_children->data[di];
|
|
741
|
+
if (is_br_node(dc)) {
|
|
742
|
+
if (pb.len > 0) {
|
|
743
|
+
buffer_append_str(out, "<p>");
|
|
744
|
+
emit_styles_open(out, s);
|
|
745
|
+
buffer_append(out, pb.data, pb.len);
|
|
746
|
+
emit_styles_close(out, s);
|
|
747
|
+
buffer_append_str(out, "</p>");
|
|
748
|
+
} else {
|
|
749
|
+
buffer_append_str(out, "<br>");
|
|
750
|
+
}
|
|
751
|
+
buffer_clear(&pb);
|
|
752
|
+
continue;
|
|
753
|
+
}
|
|
754
|
+
walk_node(dc, &pb);
|
|
755
|
+
}
|
|
756
|
+
if (pb.len > 0) {
|
|
757
|
+
buffer_append_str(out, "<p>");
|
|
758
|
+
emit_styles_open(out, s);
|
|
759
|
+
buffer_append(out, pb.data, pb.len);
|
|
760
|
+
emit_styles_close(out, s);
|
|
761
|
+
buffer_append_str(out, "</p>");
|
|
762
|
+
}
|
|
763
|
+
free(pb.data);
|
|
764
|
+
} else {
|
|
765
|
+
emit_styles_open(out, s);
|
|
766
|
+
walk_children(node, out);
|
|
767
|
+
emit_styles_close(out, s);
|
|
768
|
+
}
|
|
769
|
+
return;
|
|
770
|
+
}
|
|
771
|
+
|
|
772
|
+
/* --- Table elements --- */
|
|
773
|
+
if (strcmp(name_buf, "table") == 0 || strcmp(name_buf, "thead") == 0 ||
|
|
774
|
+
strcmp(name_buf, "tbody") == 0 || strcmp(name_buf, "tfoot") == 0 ||
|
|
775
|
+
strcmp(name_buf, "tr") == 0 || strcmp(name_buf, "td") == 0 ||
|
|
776
|
+
strcmp(name_buf, "th") == 0 || strcmp(name_buf, "caption") == 0 ||
|
|
777
|
+
strcmp(name_buf, "colgroup") == 0 || strcmp(name_buf, "col") == 0) {
|
|
778
|
+
if (strcmp(name_buf, "td") == 0 || strcmp(name_buf, "th") == 0) {
|
|
779
|
+
walk_children(node, out);
|
|
780
|
+
/* Check if there's a next sibling element */
|
|
781
|
+
GumboNode *parent = node->parent;
|
|
782
|
+
if (parent && is_element(parent)) {
|
|
783
|
+
GumboVector *siblings = &parent->v.element.children;
|
|
784
|
+
unsigned int my_idx = node->index_within_parent;
|
|
785
|
+
/* Find next element sibling */
|
|
786
|
+
bool has_next_el = false;
|
|
787
|
+
for (unsigned int si = my_idx + 1; si < siblings->length; si++) {
|
|
788
|
+
GumboNode *sib = siblings->data[si];
|
|
789
|
+
if (is_element(sib)) {
|
|
790
|
+
has_next_el = true;
|
|
791
|
+
break;
|
|
792
|
+
}
|
|
793
|
+
}
|
|
794
|
+
if (has_next_el)
|
|
795
|
+
buffer_append_str(out, " ");
|
|
796
|
+
}
|
|
797
|
+
} else if (strcmp(name_buf, "tr") == 0) {
|
|
798
|
+
buffer_t row = buffer_create(64);
|
|
799
|
+
walk_children(node, &row);
|
|
800
|
+
if (row.len > 0) {
|
|
801
|
+
buffer_append_str(out, "<p>");
|
|
802
|
+
buffer_append(out, row.data, row.len);
|
|
803
|
+
buffer_append_str(out, "</p>");
|
|
804
|
+
}
|
|
805
|
+
free(row.data);
|
|
806
|
+
} else {
|
|
807
|
+
walk_children(node, out);
|
|
808
|
+
}
|
|
809
|
+
return;
|
|
810
|
+
}
|
|
811
|
+
|
|
812
|
+
/* --- Remaining tags handled by class --- */
|
|
813
|
+
switch (cls) {
|
|
814
|
+
case TAG_CLASS_PASS:
|
|
815
|
+
case TAG_CLASS_SKIP:
|
|
816
|
+
walk_children(node, out);
|
|
817
|
+
break;
|
|
818
|
+
|
|
819
|
+
case TAG_CLASS_SELF_CLOSING:
|
|
820
|
+
buffer_append_str(out, "<");
|
|
821
|
+
buffer_append_str(out, out_name);
|
|
822
|
+
emit_attributes(el, out_name, out);
|
|
823
|
+
buffer_append_str(out, strcmp(out_name, "img") == 0 ? " />" : ">");
|
|
824
|
+
break;
|
|
825
|
+
|
|
826
|
+
case TAG_CLASS_INLINE:
|
|
827
|
+
case TAG_CLASS_BLOCK: {
|
|
828
|
+
const char *sval = get_attr(el, "style");
|
|
829
|
+
size_t slen = sval ? strlen(sval) : 0;
|
|
830
|
+
css_styles_t es = extra_styles(parse_css_style(sval, slen), out_name);
|
|
831
|
+
|
|
832
|
+
/* <li>: always flatten */
|
|
833
|
+
if (strcmp(out_name, "li") == 0) {
|
|
834
|
+
GumboNode *nested_lists[16];
|
|
835
|
+
int nested_count = 0;
|
|
836
|
+
buffer_t li_ib = buffer_create(64);
|
|
837
|
+
li_ctx_t ctx = {el, es, nested_lists, &nested_count, 16};
|
|
838
|
+
flatten_li_children(node, &li_ib, out, &ctx);
|
|
839
|
+
flush_li_buffer(&li_ib, out, &ctx);
|
|
840
|
+
free(li_ib.data);
|
|
841
|
+
for (int k = 0; k < nested_count; k++)
|
|
842
|
+
walk_children(nested_lists[k], out);
|
|
843
|
+
break;
|
|
844
|
+
}
|
|
845
|
+
|
|
846
|
+
/* <codeblock>: wrap inline content in <p> */
|
|
847
|
+
if (strcmp(out_name, "codeblock") == 0) {
|
|
848
|
+
bool wrap = is_purely_inline(node);
|
|
849
|
+
buffer_append_str(out, "<codeblock>");
|
|
850
|
+
if (wrap)
|
|
851
|
+
buffer_append_str(out, "<p>");
|
|
852
|
+
walk_children(node, out);
|
|
853
|
+
if (wrap)
|
|
854
|
+
buffer_append_str(out, "</p>");
|
|
855
|
+
buffer_append_str(out, "</codeblock>");
|
|
856
|
+
break;
|
|
857
|
+
}
|
|
858
|
+
|
|
859
|
+
/* Generic block/inline tag */
|
|
860
|
+
buffer_append_str(out, "<");
|
|
861
|
+
buffer_append_str(out, out_name);
|
|
862
|
+
emit_attributes(el, out_name, out);
|
|
863
|
+
buffer_append_str(out, ">");
|
|
864
|
+
emit_styles_open(out, es);
|
|
865
|
+
walk_children(node, out);
|
|
866
|
+
emit_styles_close(out, es);
|
|
867
|
+
buffer_append_str(out, "</");
|
|
868
|
+
buffer_append_str(out, out_name);
|
|
869
|
+
buffer_append_str(out, ">");
|
|
870
|
+
break;
|
|
871
|
+
}
|
|
872
|
+
}
|
|
873
|
+
}
|
|
874
|
+
|
|
875
|
+
/* ------------------------------------------------------------------ */
|
|
876
|
+
/* Find <body> element from parse output */
|
|
877
|
+
/* ------------------------------------------------------------------ */
|
|
878
|
+
|
|
879
|
+
/*
 * Return the direct child of `root` whose tag is GUMBO_TAG_BODY.
 *
 * Falls back to `root` itself when root is NULL, is not an element, or has
 * no such child.
 */
static GumboNode *find_body(GumboNode *root) {
  if (root && is_element(root)) {
    const GumboVector *kids = &root->v.element.children;
    for (unsigned int idx = 0; idx < kids->length; idx++) {
      GumboNode *candidate = kids->data[idx];
      if (!is_element(candidate))
        continue;
      if (candidate->v.element.tag == GUMBO_TAG_BODY)
        return candidate;
    }
  }
  return root;
}
|
|
890
|
+
|
|
891
|
+
/* ------------------------------------------------------------------ */
|
|
892
|
+
/* Public API */
|
|
893
|
+
/* ------------------------------------------------------------------ */
|
|
894
|
+
|
|
895
|
+
char *normalize_html(const char *html, size_t len) {
|
|
896
|
+
if (!html || len == 0)
|
|
897
|
+
return NULL;
|
|
898
|
+
|
|
899
|
+
GumboOutput *output =
|
|
900
|
+
gumbo_parse_with_options(&kGumboDefaultOptions, html, len);
|
|
901
|
+
if (!output)
|
|
902
|
+
return NULL;
|
|
903
|
+
|
|
904
|
+
GumboNode *body = find_body(output->root);
|
|
905
|
+
if (!body)
|
|
906
|
+
body = output->root;
|
|
907
|
+
|
|
908
|
+
buffer_t buf = buffer_create(len * 2);
|
|
909
|
+
walk_children(body, &buf);
|
|
910
|
+
|
|
911
|
+
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
|
912
|
+
return buffer_finish(&buf);
|
|
913
|
+
}
|
|
914
|
+
|
|
915
|
+
/*
 * Release a string with free(). Passing NULL is allowed and does nothing.
 */
void free_normalized_html(char *result) {
  free(result);
}
|