apex-ruby 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +119 -0
- data/apex-ruby.gemspec +31 -0
- data/ext/apex_ext/apex_ext.c +215 -0
- data/ext/apex_ext/apex_src/BENCHMARK.md +32 -0
- data/ext/apex_ext/apex_src/BENCHMARK_COMPARISON.md +67 -0
- data/ext/apex_ext/apex_src/CHANGELOG.md +2454 -0
- data/ext/apex_ext/apex_src/CMakeLists.txt +454 -0
- data/ext/apex_ext/apex_src/Dockerfile.linux-build +15 -0
- data/ext/apex_ext/apex_src/Formula/apex.rb +38 -0
- data/ext/apex_ext/apex_src/Info.plist.in +27 -0
- data/ext/apex_ext/apex_src/LICENSE +21 -0
- data/ext/apex_ext/apex_src/Package.swift +160 -0
- data/ext/apex_ext/apex_src/PackageSupport/README.md +17 -0
- data/ext/apex_ext/apex_src/PackageSupport/cmark-gfm/cmark-gfm_export.h +20 -0
- data/ext/apex_ext/apex_src/PackageSupport/cmark-gfm/cmark-gfm_version.h +14 -0
- data/ext/apex_ext/apex_src/PackageSupport/cmark-gfm/cmark_gfm_spm_stub.c +4 -0
- data/ext/apex_ext/apex_src/PackageSupport/cmark-gfm/config.h +41 -0
- data/ext/apex_ext/apex_src/README.md +452 -0
- data/ext/apex_ext/apex_src/VERSION +1 -0
- data/ext/apex_ext/apex_src/apex-header-2-rb@2x.webp +0 -0
- data/ext/apex_ext/apex_src/apex-plugins.json.example +20 -0
- data/ext/apex_ext/apex_src/apex.pc.in +11 -0
- data/ext/apex_ext/apex_src/cli/main.c +2720 -0
- data/ext/apex_ext/apex_src/debug_test.sh +22 -0
- data/ext/apex_ext/apex_src/docs/API_REFERENCE.md +451 -0
- data/ext/apex_ext/apex_src/docs/ARCHITECTURE.md +166 -0
- data/ext/apex_ext/apex_src/docs/CMARK_INTEGRATION.md +220 -0
- data/ext/apex_ext/apex_src/docs/CRITICMARKUP.md +501 -0
- data/ext/apex_ext/apex_src/docs/DEBUGGING.md +73 -0
- data/ext/apex_ext/apex_src/docs/FINAL_STATUS.md +391 -0
- data/ext/apex_ext/apex_src/docs/FINAL_STATUS_UPDATE.md +237 -0
- data/ext/apex_ext/apex_src/docs/FUTURE_FEATURES.md +456 -0
- data/ext/apex_ext/apex_src/docs/IAL_FEATURES.md +210 -0
- data/ext/apex_ext/apex_src/docs/IAL_STATUS.md +344 -0
- data/ext/apex_ext/apex_src/docs/INTEGRATION_EXAMPLE.m +144 -0
- data/ext/apex_ext/apex_src/docs/LIMITATIONS_RESOLVED.md +278 -0
- data/ext/apex_ext/apex_src/docs/OUTPUT_MODES.md +321 -0
- data/ext/apex_ext/apex_src/docs/PROGRESS.md +167 -0
- data/ext/apex_ext/apex_src/docs/STANDALONE_FEATURE.md +174 -0
- data/ext/apex_ext/apex_src/docs/TABLE_SPANS_STATUS.md +243 -0
- data/ext/apex_ext/apex_src/docs/TEST_COVERAGE.md +316 -0
- data/ext/apex_ext/apex_src/docs/USER_GUIDE.md +803 -0
- data/ext/apex_ext/apex_src/docs/WIKI_LINKS_ISSUE.md +91 -0
- data/ext/apex_ext/apex_src/documentation/README.md +160 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex Command Line Options.cheatsheet.txt +365 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Info.plist +24 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/C-API.html +1737 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Citations.html +1420 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Command-Line-Options.html +3574 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Configuration.html +1603 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Credits.html +910 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Examples.html +1168 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Getting-Started.html +1003 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Header-IDs.html +1308 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Home.html +1078 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Inline-Attribute-Lists.html +1622 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Installation.html +1168 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Limitations-and-Roadmap.html +1698 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Metadata-Transforms.html +1531 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Modes.html +1980 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Multi-File-Documents.html +1368 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Pandoc-Integration.html +1151 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Plugins.html +2861 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Syntax.html +3981 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Troubleshooting.html +1454 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Usage.html +1200 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/Documents/Xcode-Integration.html +2066 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/docSet.dsidx +0 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/optimizedIndex.dsidx +0 -0
- data/ext/apex_ext/apex_src/documentation/docsets/Apex.docset/Contents/Resources/tempOptimizedIndex.dsidx +0 -0
- data/ext/apex_ext/apex_src/documentation/docsets/ApexCLI.docset/Contents/Info.plist +22 -0
- data/ext/apex_ext/apex_src/documentation/docsets/ApexCLI.docset/Contents/Resources/Documents/cheatset_resources/Open_Sans.woff +0 -0
- data/ext/apex_ext/apex_src/documentation/docsets/ApexCLI.docset/Contents/Resources/Documents/cheatset_resources/Open_Sans_Bold.woff +0 -0
- data/ext/apex_ext/apex_src/documentation/docsets/ApexCLI.docset/Contents/Resources/Documents/cheatset_resources/Open_Sans_Bold_Italic.woff +0 -0
- data/ext/apex_ext/apex_src/documentation/docsets/ApexCLI.docset/Contents/Resources/Documents/cheatset_resources/Open_Sans_Extrabold.woff +0 -0
- data/ext/apex_ext/apex_src/documentation/docsets/ApexCLI.docset/Contents/Resources/Documents/cheatset_resources/Open_Sans_Extrabold_Italic.woff +0 -0
- data/ext/apex_ext/apex_src/documentation/docsets/ApexCLI.docset/Contents/Resources/Documents/cheatset_resources/Open_Sans_Italic.woff +0 -0
- data/ext/apex_ext/apex_src/documentation/docsets/ApexCLI.docset/Contents/Resources/Documents/cheatset_resources/Open_Sans_Semibold.woff +0 -0
- data/ext/apex_ext/apex_src/documentation/docsets/ApexCLI.docset/Contents/Resources/Documents/cheatset_resources/Open_Sans_Semibold_Italic.woff +0 -0
- data/ext/apex_ext/apex_src/documentation/docsets/ApexCLI.docset/Contents/Resources/Documents/index.html +914 -0
- data/ext/apex_ext/apex_src/documentation/docsets/ApexCLI.docset/Contents/Resources/Documents/style.css +399 -0
- data/ext/apex_ext/apex_src/documentation/docsets/ApexCLI.docset/Contents/Resources/docSet.dsidx +0 -0
- data/ext/apex_ext/apex_src/documentation/docsets/ApexCLI.docset/Contents/Resources/optimizedIndex.dsidx +0 -0
- data/ext/apex_ext/apex_src/documentation/generate_app_docs.rb +772 -0
- data/ext/apex_ext/apex_src/documentation/generate_app_docs_ai.rb +678 -0
- data/ext/apex_ext/apex_src/documentation/generate_docset.rb +873 -0
- data/ext/apex_ext/apex_src/documentation/generate_single_html.rb +733 -0
- data/ext/apex_ext/apex_src/documentation/html/apex-docs.html +17073 -0
- data/ext/apex_ext/apex_src/documentation/shared_scripts.js +64 -0
- data/ext/apex_ext/apex_src/documentation/shared_styles.css +646 -0
- data/ext/apex_ext/apex_src/documentation/transform_for_app.example.md +260 -0
- data/ext/apex_ext/apex_src/examples/bracketed_spans_demo.md +119 -0
- data/ext/apex_ext/apex_src/examples/emoji_span_plugin.yml +11 -0
- data/ext/apex_ext/apex_src/examples/example.html +53 -0
- data/ext/apex_ext/apex_src/examples/example.md +85 -0
- data/ext/apex_ext/apex_src/examples/fenced_divs_demo.md +158 -0
- data/ext/apex_ext/apex_src/examples/kbd.md +8 -0
- data/ext/apex_ext/apex_src/examples/kbd_plugin.rb +250 -0
- data/ext/apex_ext/apex_src/examples/kbd_plugin.yml +9 -0
- data/ext/apex_ext/apex_src/icon/apexicon-outline-black.png +0 -0
- data/ext/apex_ext/apex_src/icon/apexicon-outline-black@2x.png +0 -0
- data/ext/apex_ext/apex_src/icon/apexicon-outline-mark.png +0 -0
- data/ext/apex_ext/apex_src/icon/apexicon-outline-mark@2x.png +0 -0
- data/ext/apex_ext/apex_src/icon/apexicon-outline-white.png +0 -0
- data/ext/apex_ext/apex_src/icon/apexicon-outline-white@2x.png +0 -0
- data/ext/apex_ext/apex_src/icon/apexicon.png +0 -0
- data/ext/apex_ext/apex_src/icon/apexicon@2x.png +0 -0
- data/ext/apex_ext/apex_src/include/apex/apex.h +247 -0
- data/ext/apex_ext/apex_src/include/apex/buffer.h +93 -0
- data/ext/apex_ext/apex_src/include/apex/module.modulemap +16 -0
- data/ext/apex_ext/apex_src/include/apex/parser.h +150 -0
- data/ext/apex_ext/apex_src/include/apex/renderer.h +39 -0
- data/ext/apex_ext/apex_src/man/apex-config.5 +374 -0
- data/ext/apex_ext/apex_src/man/apex-config.5.md +260 -0
- data/ext/apex_ext/apex_src/man/apex-plugins.7 +456 -0
- data/ext/apex_ext/apex_src/man/apex-plugins.7.md +365 -0
- data/ext/apex_ext/apex_src/man/apex.1 +828 -0
- data/ext/apex_ext/apex_src/man/apex.1.md +643 -0
- data/ext/apex_ext/apex_src/man/apex.1.new +338 -0
- data/ext/apex_ext/apex_src/objc/Apex.swift +237 -0
- data/ext/apex_ext/apex_src/objc/NSString+Apex.h +117 -0
- data/ext/apex_ext/apex_src/objc/NSString+Apex.m +332 -0
- data/ext/apex_ext/apex_src/src/_README.md +358 -0
- data/ext/apex_ext/apex_src/src/apex.c +6326 -0
- data/ext/apex_ext/apex_src/src/buffer.c +93 -0
- data/ext/apex_ext/apex_src/src/extensions/abbreviations.c +362 -0
- data/ext/apex_ext/apex_src/src/extensions/abbreviations.h +45 -0
- data/ext/apex_ext/apex_src/src/extensions/advanced_footnotes.c +184 -0
- data/ext/apex_ext/apex_src/src/extensions/advanced_footnotes.h +50 -0
- data/ext/apex_ext/apex_src/src/extensions/advanced_tables.c +1897 -0
- data/ext/apex_ext/apex_src/src/extensions/advanced_tables.h +42 -0
- data/ext/apex_ext/apex_src/src/extensions/callouts.c +215 -0
- data/ext/apex_ext/apex_src/src/extensions/callouts.h +53 -0
- data/ext/apex_ext/apex_src/src/extensions/citations.c +2042 -0
- data/ext/apex_ext/apex_src/src/extensions/citations.h +163 -0
- data/ext/apex_ext/apex_src/src/extensions/critic.c +329 -0
- data/ext/apex_ext/apex_src/src/extensions/critic.h +48 -0
- data/ext/apex_ext/apex_src/src/extensions/definition_list.c +1670 -0
- data/ext/apex_ext/apex_src/src/extensions/definition_list.h +42 -0
- data/ext/apex_ext/apex_src/src/extensions/emoji.c +710 -0
- data/ext/apex_ext/apex_src/src/extensions/emoji.h +38 -0
- data/ext/apex_ext/apex_src/src/extensions/emoji_data.h +942 -0
- data/ext/apex_ext/apex_src/src/extensions/fenced_divs.c +925 -0
- data/ext/apex_ext/apex_src/src/extensions/fenced_divs.h +43 -0
- data/ext/apex_ext/apex_src/src/extensions/github-emoji.txt +869 -0
- data/ext/apex_ext/apex_src/src/extensions/grid_tables.c +1121 -0
- data/ext/apex_ext/apex_src/src/extensions/grid_tables.h +33 -0
- data/ext/apex_ext/apex_src/src/extensions/header_ids.c +626 -0
- data/ext/apex_ext/apex_src/src/extensions/header_ids.h +60 -0
- data/ext/apex_ext/apex_src/src/extensions/highlight.c +135 -0
- data/ext/apex_ext/apex_src/src/extensions/highlight.h +16 -0
- data/ext/apex_ext/apex_src/src/extensions/html_markdown.c +408 -0
- data/ext/apex_ext/apex_src/src/extensions/html_markdown.h +42 -0
- data/ext/apex_ext/apex_src/src/extensions/ial.c +4084 -0
- data/ext/apex_ext/apex_src/src/extensions/ial.h +145 -0
- data/ext/apex_ext/apex_src/src/extensions/includes.c +1536 -0
- data/ext/apex_ext/apex_src/src/extensions/includes.h +54 -0
- data/ext/apex_ext/apex_src/src/extensions/index.c +967 -0
- data/ext/apex_ext/apex_src/src/extensions/index.h +90 -0
- data/ext/apex_ext/apex_src/src/extensions/inline_footnotes.c +205 -0
- data/ext/apex_ext/apex_src/src/extensions/inline_footnotes.h +34 -0
- data/ext/apex_ext/apex_src/src/extensions/inline_tables.c +332 -0
- data/ext/apex_ext/apex_src/src/extensions/inline_tables.h +13 -0
- data/ext/apex_ext/apex_src/src/extensions/insert.c +248 -0
- data/ext/apex_ext/apex_src/src/extensions/insert.h +18 -0
- data/ext/apex_ext/apex_src/src/extensions/math.c +279 -0
- data/ext/apex_ext/apex_src/src/extensions/math.h +32 -0
- data/ext/apex_ext/apex_src/src/extensions/metadata.c +3046 -0
- data/ext/apex_ext/apex_src/src/extensions/metadata.h +125 -0
- data/ext/apex_ext/apex_src/src/extensions/relaxed_tables.c +1297 -0
- data/ext/apex_ext/apex_src/src/extensions/relaxed_tables.h +39 -0
- data/ext/apex_ext/apex_src/src/extensions/special_markers.c +194 -0
- data/ext/apex_ext/apex_src/src/extensions/special_markers.h +29 -0
- data/ext/apex_ext/apex_src/src/extensions/sup_sub.c +405 -0
- data/ext/apex_ext/apex_src/src/extensions/sup_sub.h +16 -0
- data/ext/apex_ext/apex_src/src/extensions/syntax_highlight.c +468 -0
- data/ext/apex_ext/apex_src/src/extensions/syntax_highlight.h +44 -0
- data/ext/apex_ext/apex_src/src/extensions/table_html_postprocess.c +2679 -0
- data/ext/apex_ext/apex_src/src/extensions/table_html_postprocess.h +23 -0
- data/ext/apex_ext/apex_src/src/extensions/toc.c +255 -0
- data/ext/apex_ext/apex_src/src/extensions/toc.h +34 -0
- data/ext/apex_ext/apex_src/src/extensions/wiki_links.c +624 -0
- data/ext/apex_ext/apex_src/src/extensions/wiki_links.h +58 -0
- data/ext/apex_ext/apex_src/src/html_renderer.c +2762 -0
- data/ext/apex_ext/apex_src/src/html_renderer.h +126 -0
- data/ext/apex_ext/apex_src/src/parser.c +227 -0
- data/ext/apex_ext/apex_src/src/plugins.c +895 -0
- data/ext/apex_ext/apex_src/src/plugins.h +39 -0
- data/ext/apex_ext/apex_src/src/plugins_env.c +187 -0
- data/ext/apex_ext/apex_src/src/plugins_remote.c +263 -0
- data/ext/apex_ext/apex_src/src/pretty_html.c +358 -0
- data/ext/apex_ext/apex_src/src/renderer.c +241 -0
- data/ext/apex_ext/apex_src/src/utf8.c +56 -0
- data/ext/apex_ext/apex_src/test-linux-build.sh +20 -0
- data/ext/apex_ext/apex_src/test.html +103 -0
- data/ext/apex_ext/apex_src/test_coverage.sh +121 -0
- data/ext/apex_ext/apex_src/test_ial_fenced.md +6 -0
- data/ext/apex_ext/apex_src/test_math_norm.py +79 -0
- data/ext/apex_ext/apex_src/test_pandoc_output.html +48 -0
- data/ext/apex_ext/apex_src/test_spm.sh +107 -0
- data/ext/apex_ext/apex_src/tests/ApexSPMTest/main.swift +50 -0
- data/ext/apex_ext/apex_src/tests/BENCHMARK_RESULTS.md +229 -0
- data/ext/apex_ext/apex_src/tests/CMakeLists.txt +24 -0
- data/ext/apex_ext/apex_src/tests/README.md +146 -0
- data/ext/apex_ext/apex_src/tests/benchmark.sh +113 -0
- data/ext/apex_ext/apex_src/tests/benchmark_comparison.sh +166 -0
- data/ext/apex_ext/apex_src/tests/compare_header_ids.sh +31 -0
- data/ext/apex_ext/apex_src/tests/fixtures/basic/headers.md +25 -0
- data/ext/apex_ext/apex_src/tests/fixtures/basic/list-interruption.md +24 -0
- data/ext/apex_ext/apex_src/tests/fixtures/basic/misc_markup.md +33 -0
- data/ext/apex_ext/apex_src/tests/fixtures/basic/test_basic.md +26 -0
- data/ext/apex_ext/apex_src/tests/fixtures/code/code-blocks.md +260 -0
- data/ext/apex_ext/apex_src/tests/fixtures/combine_summary/SUMMARY.md +6 -0
- data/ext/apex_ext/apex_src/tests/fixtures/combine_summary/chapter1.md +7 -0
- data/ext/apex_ext/apex_src/tests/fixtures/combine_summary/index.txt +9 -0
- data/ext/apex_ext/apex_src/tests/fixtures/combine_summary/intro.md +5 -0
- data/ext/apex_ext/apex_src/tests/fixtures/combine_summary/section1_1.md +5 -0
- data/ext/apex_ext/apex_src/tests/fixtures/comprehensive_test.md +620 -0
- data/ext/apex_ext/apex_src/tests/fixtures/debug_ref_image_ial.md +3 -0
- data/ext/apex_ext/apex_src/tests/fixtures/demos/ial.md +11 -0
- data/ext/apex_ext/apex_src/tests/fixtures/demos/ial_demo.md +177 -0
- data/ext/apex_ext/apex_src/tests/fixtures/extensions/emoji-autocorrect.md +94 -0
- data/ext/apex_ext/apex_src/tests/fixtures/extensions/emoji_test.md +3 -0
- data/ext/apex_ext/apex_src/tests/fixtures/extensions/kbd_test.md +3 -0
- data/ext/apex_ext/apex_src/tests/fixtures/ial/bracketed_spans_test.md +74 -0
- data/ext/apex_ext/apex_src/tests/fixtures/images/image_and_encoding_test.md +27 -0
- data/ext/apex_ext/apex_src/tests/fixtures/images/multimarkdown_image_attributes_test.md +60 -0
- data/ext/apex_ext/apex_src/tests/fixtures/images/pandoc_ial_image_test.md +27 -0
- data/ext/apex_ext/apex_src/tests/fixtures/images/width_height_conversion_test.md +94 -0
- data/ext/apex_ext/apex_src/tests/fixtures/img-in-div.md +16 -0
- data/ext/apex_ext/apex_src/tests/fixtures/includes/code.py +4 -0
- data/ext/apex_ext/apex_src/tests/fixtures/includes/data.csv +5 -0
- data/ext/apex_ext/apex_src/tests/fixtures/includes/data.tsv +5 -0
- data/ext/apex_ext/apex_src/tests/fixtures/includes/image.png +2 -0
- data/ext/apex_ext/apex_src/tests/fixtures/includes/metadata_options.yml +11 -0
- data/ext/apex_ext/apex_src/tests/fixtures/includes/nested.md +8 -0
- data/ext/apex_ext/apex_src/tests/fixtures/includes/raw.html +4 -0
- data/ext/apex_ext/apex_src/tests/fixtures/includes/simple.md +7 -0
- data/ext/apex_ext/apex_src/tests/fixtures/includes/test_image.png +0 -0
- data/ext/apex_ext/apex_src/tests/fixtures/large_doc.md +1094 -0
- data/ext/apex_ext/apex_src/tests/fixtures/metadata_options.yml +11 -0
- data/ext/apex_ext/apex_src/tests/fixtures/output/gfm_header_id_test.md +96 -0
- data/ext/apex_ext/apex_src/tests/fixtures/output/test_citations.md +43 -0
- data/ext/apex_ext/apex_src/tests/fixtures/output/test_def_list_links.md +12 -0
- data/ext/apex_ext/apex_src/tests/fixtures/output/test_index_mmark.md +53 -0
- data/ext/apex_ext/apex_src/tests/fixtures/output/test_index_textindex.md +37 -0
- data/ext/apex_ext/apex_src/tests/fixtures/tables/advanced_tables_test.md +93 -0
- data/ext/apex_ext/apex_src/tests/fixtures/tables/inline_tables_test.md +38 -0
- data/ext/apex_ext/apex_src/tests/fixtures/tables/relaxed-table.md +12 -0
- data/ext/apex_ext/apex_src/tests/fixtures/tables/table_cr_line_endings.md +15 -0
- data/ext/apex_ext/apex_src/tests/fixtures/tables/table_no_trailing_newline.md +15 -0
- data/ext/apex_ext/apex_src/tests/generate_gfm_ids.sh +105 -0
- data/ext/apex_ext/apex_src/tests/generate_ial_demo.sh +143 -0
- data/ext/apex_ext/apex_src/tests/gfm_id_comparison_summary.md +96 -0
- data/ext/apex_ext/apex_src/tests/gh_api_test.md +6 -0
- data/ext/apex_ext/apex_src/tests/ial_demo.html +186 -0
- data/ext/apex_ext/apex_src/tests/include_code.py +19 -0
- data/ext/apex_ext/apex_src/tests/include_snippet.md +15 -0
- data/ext/apex_ext/apex_src/tests/multi_file_cli_test.sh +64 -0
- data/ext/apex_ext/apex_src/tests/sample_data.csv +7 -0
- data/ext/apex_ext/apex_src/tests/table_escaped_ltlt.md +4 -0
- data/ext/apex_ext/apex_src/tests/test_basic.c +74 -0
- data/ext/apex_ext/apex_src/tests/test_extensions.c +2116 -0
- data/ext/apex_ext/apex_src/tests/test_helpers.c +183 -0
- data/ext/apex_ext/apex_src/tests/test_helpers.h +91 -0
- data/ext/apex_ext/apex_src/tests/test_ial.c +282 -0
- data/ext/apex_ext/apex_src/tests/test_links.c +418 -0
- data/ext/apex_ext/apex_src/tests/test_marked_integration.c +265 -0
- data/ext/apex_ext/apex_src/tests/test_metadata.c +908 -0
- data/ext/apex_ext/apex_src/tests/test_output.c +1118 -0
- data/ext/apex_ext/apex_src/tests/test_plugins.c +219 -0
- data/ext/apex_ext/apex_src/tests/test_refs.bib +31 -0
- data/ext/apex_ext/apex_src/tests/test_runner.c +244 -0
- data/ext/apex_ext/apex_src/tests/test_syntax_highlight.c +198 -0
- data/ext/apex_ext/apex_src/tests/test_tables.c +862 -0
- data/ext/apex_ext/apex_src/tests/update_benchmarks.sh +9 -0
- data/ext/apex_ext/apex_src/tests/yaml_test.md +13 -0
- data/ext/apex_ext/apex_src/tests.rb +39 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/CMakeLists.txt +48 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/COPYING +170 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/CheckFileOffsetBits.c +14 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/CheckFileOffsetBits.cmake +43 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/FindAsan.cmake +74 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/Makefile.nmake +38 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/README.md +206 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/api_test/CMakeLists.txt +30 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/api_test/cplusplus.cpp +15 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/api_test/cplusplus.h +16 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/api_test/harness.c +111 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/api_test/harness.h +35 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/api_test/main.c +1169 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/appveyor.yml +21 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/block-bq-flat.md +16 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/block-bq-nested.md +13 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/block-code.md +11 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/block-fences.md +14 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/block-heading.md +9 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/block-hr.md +10 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/block-html.md +32 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/block-lheading.md +8 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/block-list-flat.md +67 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/block-list-nested.md +36 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/block-ref-flat.md +15 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/block-ref-nested.md +17 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/inline-autolink.md +14 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/inline-backticks.md +3 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/inline-em-flat.md +5 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/inline-em-nested.md +5 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/inline-em-worst.md +5 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/inline-entity.md +11 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/inline-escape.md +15 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/inline-html.md +44 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/inline-links-flat.md +23 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/inline-links-nested.md +13 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/inline-newlines.md +24 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/lorem1.md +13 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/samples/rawtabs.md +18 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/statistics.py +595 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/bench/stats.py +19 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/benchmarks.md +33 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/changelog.txt +1245 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/data/CaseFolding.txt +1495 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/CMakeLists.txt +119 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/autolink.c +508 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/autolink.h +8 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/cmark-gfm-core-extensions.h +54 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/core-extensions.c +27 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/ext_scanners.c +879 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/ext_scanners.h +24 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/ext_scanners.re +92 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/strikethrough.c +167 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/strikethrough.h +9 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/table.c +917 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/table.h +12 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/tagfilter.c +60 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/tagfilter.h +8 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/tasklist.c +156 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/extensions/tasklist.h +8 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/fuzz/CMakeLists.txt +22 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/fuzz/README.md +12 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/fuzz/fuzz_quadratic.c +91 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/fuzz/fuzz_quadratic_brackets.c +110 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/fuzz/fuzzloop.sh +28 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/man/CMakeLists.txt +10 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/man/make_man_page.py +133 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/man/man1/cmark-gfm.1 +78 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/man/man3/cmark-gfm.3 +1041 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/nmake.bat +1 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/CMakeLists.txt +230 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/arena.c +104 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/blocks.c +1622 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/buffer.c +278 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/buffer.h +116 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/case_fold_switch.inc +4327 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/chunk.h +135 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/cmark-gfm-extension_api.h +737 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/cmark-gfm.h +833 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/cmark-gfm_version.h.in +7 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/cmark.c +55 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/cmark_ctype.c +44 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/cmark_ctype.h +33 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/commonmark.c +514 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/config.h.in +76 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/entities.inc +2138 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/footnotes.c +63 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/footnotes.h +27 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/houdini.h +57 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/houdini_href_e.c +100 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/houdini_html_e.c +66 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/houdini_html_u.c +149 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/html.c +502 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/html.h +27 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/inlines.c +1788 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/inlines.h +29 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/iterator.c +159 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/iterator.h +26 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/latex.c +468 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/libcmark-gfm.pc.in +10 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/linked_list.c +37 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/main.c +328 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/man.c +274 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/map.c +129 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/map.h +44 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/node.c +1045 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/node.h +167 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/parser.h +59 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/plaintext.c +218 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/plugin.c +36 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/plugin.h +34 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/references.c +43 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/references.h +26 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/registry.c +63 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/registry.h +24 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/render.c +213 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/render.h +62 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/scanners.c +14056 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/scanners.h +70 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/scanners.re +365 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/syntax_extension.c +149 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/syntax_extension.h +34 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/utf8.c +317 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/utf8.h +35 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/src/xml.c +182 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/suppressions +10 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/CMakeLists.txt +114 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/afl_test_cases/test.md +49 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/cmark-fuzz.c +58 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/cmark.py +105 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/entity_tests.py +67 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/extensions-full-info-string.txt +0 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/extensions-table-prefer-style-attributes.txt +38 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/extensions.txt +920 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/fuzzing_dictionary +67 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/normalize.py +194 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/pathological_tests.py +160 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/regression.txt +375 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/roundtrip_tests.py +50 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/run-cmark-fuzz +4 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/smart_punct.txt +177 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/spec.txt +10212 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/test/spec_tests.py +152 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/toolchain-mingw32.cmake +17 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/tools/Dockerfile +41 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/tools/appveyor-build.bat +13 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/tools/make_entities_inc.py +32 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/tools/mkcasefold.pl +22 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/tools/xml2md.xsl +319 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/tools/xml2md_gfm.xsl +80 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/why-cmark-and-not-x.md +104 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/wrappers/wrapper.js +6 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/wrappers/wrapper.py +37 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/wrappers/wrapper.rb +15 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/wrappers/wrapper.rkt +208 -0
- data/ext/apex_ext/apex_src/vendor/cmark-gfm/wrappers/wrapper_ext.py +109 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/CMakeLists.txt +160 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/Changes +372 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/License +20 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/Makefile.am +51 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/ReadMe.md +46 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/announcement.msg +89 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/bootstrap +3 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/cmake/config.h.in +4 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/configure.ac +73 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/doc/doxygen.cfg +222 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/docker/README.mkd +17 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/docker/alpine-3.7 +26 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/docker/fedora-25 +26 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/docker/ubuntu-14.04 +29 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/docker/ubuntu-16.04 +24 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/examples/anchors.yaml +10 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/examples/array.yaml +2 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/examples/global-tag.yaml +14 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/examples/json.yaml +1 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/examples/mapping.yaml +2 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/examples/numbers.yaml +1 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/examples/strings.yaml +7 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/examples/tags.yaml +7 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/examples/yaml-version.yaml +3 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/include/Makefile.am +17 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/include/yaml.h +1999 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/pkg/ReadMe.md +77 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/pkg/docker/Dockerfile +32 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/pkg/docker/output/ReadMe +1 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/pkg/docker/scripts/libyaml-dist.sh +23 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/regression-inputs/clusterfuzz-testcase-minimized-5607885063061504.yml +1 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/src/Makefile.am +4 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/src/api.c +1393 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/src/dumper.c +394 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/src/emitter.c +2358 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/src/loader.c +544 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/src/parser.c +1416 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/src/reader.c +469 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/src/scanner.c +3598 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/src/writer.c +141 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/src/yaml_private.h +684 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/CMakeLists.txt +27 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/Makefile.am +9 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/ReadMe.md +63 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/example-deconstructor-alt.c +800 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/example-deconstructor.c +1127 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/example-reformatter-alt.c +217 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/example-reformatter.c +202 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/run-all-tests.sh +29 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/run-dumper.c +314 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/run-emitter-test-suite.c +290 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/run-emitter.c +327 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/run-loader.c +63 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/run-parser-test-suite.c +196 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/run-parser.c +88 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/run-scanner.c +63 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/test-reader.c +354 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/tests/test-version.c +29 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/yaml-0.1.pc.in +10 -0
- data/ext/apex_ext/apex_src/vendor/libyaml/yamlConfig.cmake.in +16 -0
- data/ext/apex_ext/extconf.rb +103 -0
- data/lib/apex/configurable.rb +46 -0
- data/lib/apex/document.rb +66 -0
- data/lib/apex/version.rb +15 -0
- data/lib/apex.rb +28 -0
- metadata +544 -0
|
@@ -0,0 +1,2679 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Table HTML Postprocessing
|
|
3
|
+
* Implementation
|
|
4
|
+
*
|
|
5
|
+
* This is a pragmatic solution: we walk the AST to collect cells with
|
|
6
|
+
* rowspan/colspan attributes, then do pattern matching on the HTML to inject them.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
#include "table_html_postprocess.h"
|
|
10
|
+
#include "cmark-gfm-core-extensions.h"
|
|
11
|
+
#include "table.h" /* For CMARK_NODE_TABLE_ROW, CMARK_NODE_TABLE_CELL */
|
|
12
|
+
#include <stdio.h>
|
|
13
|
+
#include <stdlib.h>
|
|
14
|
+
#include <string.h>
|
|
15
|
+
#include <stdbool.h>
|
|
16
|
+
#include <ctype.h>
|
|
17
|
+
#include <time.h>
|
|
18
|
+
|
|
19
|
+
/*
 * Placeholder for an escaped \<< (a literal "<<"). Must match table.c.
 * The placeholder contains no underscore so the inline parser doesn't treat
 * it as emphasis.
 */
static const unsigned char ESCAPED_LTLT_PLACEHOLDER[] = "APEXLTLT";
#define ESCAPED_LTLT_PLACEHOLDER_LEN 8
/*
 * Text substituted for the placeholder in the rendered HTML. It has to be the
 * entity-escaped form: a raw "<<" would be read as markup, and it would be
 * only 2 bytes long, contradicting ESCAPED_LTLT_REPLACEMENT_LEN below.
 * Both *_LEN values exclude the terminating NUL and must be kept in sync with
 * their string literals.
 */
#define ESCAPED_LTLT_REPLACEMENT "&lt;&lt;"
#define ESCAPED_LTLT_REPLACEMENT_LEN 8
|
|
24
|
+
|
|
25
|
+
/*
 * Singly linked record for one table cell that carries extra attributes.
 * The three indices locate the cell; the two strings are heap copies owned
 * by the record.
 */
typedef struct cell_attr cell_attr;
struct cell_attr {
    int table_index;   /* Zero-based number of the table the cell belongs to */
    int row_index;     /* Zero-based row within that table */
    int col_index;     /* Zero-based column within that row */
    char *attributes;  /* Attribute text, e.g. " rowspan=\"2\"" or " data-remove=\"true\"" */
    char *cell_text;   /* Cell content kept for content-based matching */
    cell_attr *next;   /* Next record, or NULL at the end of the list */
};
|
|
34
|
+
|
|
35
|
+
/* Singly linked record naming one row that belongs in a table's tfoot. */
typedef struct tfoot_row tfoot_row;
struct tfoot_row {
    int table_index;  /* Zero-based number of the table */
    int row_index;    /* Zero-based row within that table */
    tfoot_row *next;  /* Next record, or NULL at the end of the list */
};
|
|
41
|
+
|
|
42
|
+
/* Singly linked record pairing a table with its caption text. */
typedef struct table_caption table_caption;
struct table_caption {
    int table_index;      /* Zero-based number of the captioned table */
    char *caption;        /* Heap-allocated caption text */
    table_caption *next;  /* Next record, or NULL at the end of the list */
};
|
|
48
|
+
|
|
49
|
+
/*
 * Singly linked record for one paragraph that must be dropped from the
 * output (caption paragraphs).
 */
typedef struct para_to_remove para_to_remove;
struct para_to_remove {
    int para_index;          /* Zero-based paragraph number in document order */
    char *text_fingerprint;  /* Leading text (at most 50 chars) used for matching */
    para_to_remove *next;    /* Next record, or NULL at the end of the list */
};
|
|
55
|
+
|
|
56
|
+
/*
 * Singly linked record for every table cell, attributed or not; used when
 * computing the column mapping.
 */
typedef struct all_cell all_cell;
struct all_cell {
    int table_index;  /* Zero-based number of the table */
    int row_index;    /* Zero-based row within that table */
    int col_index;    /* Zero-based column within that row */
    bool is_removed;  /* true when the cell is marked with data-remove */
    all_cell *next;   /* Next record, or NULL at the end of the list */
};
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Walk AST and collect all cells (for mapping calculation)
|
|
67
|
+
*/
|
|
68
|
+
static all_cell *collect_all_cells(cmark_node *document) {
|
|
69
|
+
all_cell *list = NULL;
|
|
70
|
+
|
|
71
|
+
cmark_iter *iter = cmark_iter_new(document);
|
|
72
|
+
cmark_event_type ev_type;
|
|
73
|
+
|
|
74
|
+
int table_index = -1; /* Track which table we're in */
|
|
75
|
+
int row_index = -1; /* Start at -1, will increment to 0 on first row */
|
|
76
|
+
int col_index = 0;
|
|
77
|
+
|
|
78
|
+
while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
|
|
79
|
+
cmark_node *node = cmark_iter_get_node(iter);
|
|
80
|
+
cmark_node_type type = cmark_node_get_type(node);
|
|
81
|
+
|
|
82
|
+
if (ev_type == CMARK_EVENT_ENTER) {
|
|
83
|
+
if (type == CMARK_NODE_TABLE) {
|
|
84
|
+
table_index++; /* New table */
|
|
85
|
+
row_index = -1; /* Reset row index for new table */
|
|
86
|
+
} else if (type == CMARK_NODE_TABLE_ROW) {
|
|
87
|
+
row_index++; /* Increment for each row */
|
|
88
|
+
col_index = 0;
|
|
89
|
+
} else if (type == CMARK_NODE_TABLE_CELL) {
|
|
90
|
+
char *attrs = (char *)cmark_node_get_user_data(node);
|
|
91
|
+
bool is_removed = (attrs && strstr(attrs, "data-remove"));
|
|
92
|
+
|
|
93
|
+
/* Store ALL cells for mapping calculation */
|
|
94
|
+
all_cell *cell = malloc(sizeof(all_cell));
|
|
95
|
+
if (cell) {
|
|
96
|
+
cell->table_index = table_index;
|
|
97
|
+
cell->row_index = row_index;
|
|
98
|
+
cell->col_index = col_index;
|
|
99
|
+
cell->is_removed = is_removed;
|
|
100
|
+
cell->next = list;
|
|
101
|
+
list = cell;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
col_index++;
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
cmark_iter_free(iter);
|
|
110
|
+
return list;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
/**
|
|
114
|
+
* Walk AST and collect cells with attributes
|
|
115
|
+
*/
|
|
116
|
+
static cell_attr *collect_table_cell_attributes(cmark_node *document) {
|
|
117
|
+
cell_attr *list = NULL;
|
|
118
|
+
|
|
119
|
+
cmark_iter *iter = cmark_iter_new(document);
|
|
120
|
+
cmark_event_type ev_type;
|
|
121
|
+
|
|
122
|
+
int table_index = -1; /* Track which table we're in */
|
|
123
|
+
int row_index = -1; /* Start at -1, will increment to 0 on first row */
|
|
124
|
+
int col_index = 0;
|
|
125
|
+
|
|
126
|
+
while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
|
|
127
|
+
cmark_node *node = cmark_iter_get_node(iter);
|
|
128
|
+
cmark_node_type type = cmark_node_get_type(node);
|
|
129
|
+
|
|
130
|
+
if (ev_type == CMARK_EVENT_ENTER) {
|
|
131
|
+
if (type == CMARK_NODE_TABLE) {
|
|
132
|
+
table_index++; /* New table */
|
|
133
|
+
row_index = -1; /* Reset row index for new table */
|
|
134
|
+
} else if (type == CMARK_NODE_TABLE_ROW) {
|
|
135
|
+
row_index++; /* Increment for each row */
|
|
136
|
+
col_index = 0;
|
|
137
|
+
|
|
138
|
+
/* Check if this row is marked as tfoot */
|
|
139
|
+
char *row_attrs = (char *)cmark_node_get_user_data(node);
|
|
140
|
+
if (row_attrs && strstr(row_attrs, "data-tfoot")) {
|
|
141
|
+
/* Store this row as a tfoot row */
|
|
142
|
+
tfoot_row *tfoot = malloc(sizeof(tfoot_row));
|
|
143
|
+
if (tfoot) {
|
|
144
|
+
tfoot->table_index = table_index;
|
|
145
|
+
tfoot->row_index = row_index;
|
|
146
|
+
/* We'll add this to a list later - for now just mark it on the row */
|
|
147
|
+
/* We'll check for it during HTML processing */
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
} else if (type == CMARK_NODE_TABLE_CELL) {
|
|
151
|
+
char *attrs = (char *)cmark_node_get_user_data(node);
|
|
152
|
+
if (attrs) {
|
|
153
|
+
/* Get cell content - check recursively for text nodes */
|
|
154
|
+
const char *cell_text = NULL;
|
|
155
|
+
cmark_node *child = cmark_node_first_child(node);
|
|
156
|
+
while (child) {
|
|
157
|
+
if (cmark_node_get_type(child) == CMARK_NODE_TEXT) {
|
|
158
|
+
cell_text = cmark_node_get_literal(child);
|
|
159
|
+
break;
|
|
160
|
+
}
|
|
161
|
+
/* Check nested nodes (paragraphs, etc.) */
|
|
162
|
+
cmark_node *nested = cmark_node_first_child(child);
|
|
163
|
+
while (nested) {
|
|
164
|
+
if (cmark_node_get_type(nested) == CMARK_NODE_TEXT) {
|
|
165
|
+
cell_text = cmark_node_get_literal(nested);
|
|
166
|
+
break;
|
|
167
|
+
}
|
|
168
|
+
nested = cmark_node_next(nested);
|
|
169
|
+
}
|
|
170
|
+
if (cell_text) break;
|
|
171
|
+
child = cmark_node_next(child);
|
|
172
|
+
}
|
|
173
|
+
if (!cell_text) cell_text = "?";
|
|
174
|
+
|
|
175
|
+
/* Store this cell's attributes */
|
|
176
|
+
cell_attr *attr = malloc(sizeof(cell_attr));
|
|
177
|
+
if (attr) {
|
|
178
|
+
attr->table_index = table_index;
|
|
179
|
+
attr->row_index = row_index;
|
|
180
|
+
attr->col_index = col_index;
|
|
181
|
+
attr->attributes = strdup(attrs);
|
|
182
|
+
attr->cell_text = cell_text ? strdup(cell_text) : NULL;
|
|
183
|
+
attr->next = list;
|
|
184
|
+
list = attr;
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
/* Count all cells (including removed ones) to match the column indices
|
|
188
|
+
* used in advanced_tables.c when finding target cells for rowspan.
|
|
189
|
+
* The HTML renderer will remove cells with data-remove, but we need to
|
|
190
|
+
* match based on the original column positions. */
|
|
191
|
+
col_index++;
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
cmark_iter_free(iter);
|
|
197
|
+
return list;
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
/**
 * Process cell alignment colons and return alignment style
 *
 * Detects a leading and/or trailing colon in the cell content and reports the
 * matching CSS declaration:
 *   - leading and trailing colon -> "text-align: center"
 *   - leading colon only         -> "text-align: left"
 *   - trailing colon only        -> "text-align: right"
 *
 * A leading colon is the first non-whitespace character, provided it is not
 * immediately followed by another colon ("::" is not an alignment marker).
 * A trailing colon is the last non-whitespace character, provided it is not
 * preceded by a backslash. On success content_start / content_end are moved
 * inward so that the colons are excluded from the content.
 *
 * Based on Jekyll Spaceship's handle_text_align function.
 *
 * @param content_start Pointer to start of cell content (after >)
 * @param content_end Pointer to end of cell content (before </td> or </th>)
 * @param align_out Output parameter: receives a malloc'd alignment string
 *                  (caller frees) on success; set to NULL on every other
 *                  path once the three arguments are known to be non-NULL
 * @return true if alignment was detected and processed, false otherwise
 *         (including allocation failure, in which case the content bounds
 *         are left untouched)
 */
static bool process_cell_alignment(const char **content_start, const char **content_end, char **align_out) {
    if (!content_start || !content_end || !align_out) return false;

    /* Give the caller a defined value on every return path. */
    *align_out = NULL;

    const char *start = *content_start;
    const char *end = *content_end;
    if (!start || !end || start >= end) return false;

    /* Fast early exit: most cells contain no colon at all. */
    if (!memchr(start, ':', (size_t)(end - start))) return false;

    /* First non-whitespace character. It cannot be an escaped colon: whatever
     * precedes it is whitespace or nothing, never a backslash. */
    const char *first = start;
    while (first < end && isspace((unsigned char)*first)) first++;
    bool has_leading_colon = (first < end && *first == ':' &&
                              !(first + 1 < end && first[1] == ':'));

    /* Last non-whitespace character. The colon found above guarantees one
     * exists, so this scan never steps below start. */
    const char *last = end - 1;
    while (last > start && isspace((unsigned char)*last)) last--;
    bool has_trailing_colon = (*last == ':' && (last == start || last[-1] != '\\'));

    if (!has_leading_colon && !has_trailing_colon) return false;

    const char *align_str;
    if (has_leading_colon && has_trailing_colon) {
        align_str = "text-align: center";
    } else if (has_leading_colon) {
        align_str = "text-align: left";
    } else {
        align_str = "text-align: right";
    }

    /* Allocate before touching the bounds so failure leaves them intact. */
    size_t align_len = strlen(align_str);
    char *align_copy = malloc(align_len + 1);
    if (!align_copy) return false;
    memcpy(align_copy, align_str, align_len + 1);
    *align_out = align_copy;

    if (has_leading_colon) {
        *content_start = first + 1;
    }
    /* When one colon is both leading and trailing (content is just ":"), it
     * has already been consumed from the front; leave content_end alone. */
    if (has_trailing_colon && last >= *content_start) {
        *content_end = last;
    }

    return true;
}
|
|
307
|
+
|
|
308
|
+
/**
|
|
309
|
+
* Get text fingerprint from paragraph node (first 50 chars for matching)
|
|
310
|
+
*/
|
|
311
|
+
static char *get_para_text_fingerprint(cmark_node *node) {
|
|
312
|
+
if (!node || cmark_node_get_type(node) != CMARK_NODE_PARAGRAPH) return NULL;
|
|
313
|
+
|
|
314
|
+
cmark_node *child = cmark_node_first_child(node);
|
|
315
|
+
if (child && cmark_node_get_type(child) == CMARK_NODE_TEXT) {
|
|
316
|
+
const char *text = cmark_node_get_literal(child);
|
|
317
|
+
if (text) {
|
|
318
|
+
size_t len = strlen(text);
|
|
319
|
+
if (len > 50) len = 50;
|
|
320
|
+
char *fingerprint = malloc(len + 1);
|
|
321
|
+
if (fingerprint) {
|
|
322
|
+
memcpy(fingerprint, text, len);
|
|
323
|
+
fingerprint[len] = '\0';
|
|
324
|
+
return fingerprint;
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
}
|
|
328
|
+
return NULL;
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
/**
|
|
332
|
+
* Walk AST and collect table captions and paragraphs to remove
|
|
333
|
+
*/
|
|
334
|
+
static table_caption *collect_table_captions(cmark_node *document, para_to_remove **paras_to_remove) {
|
|
335
|
+
table_caption *list = NULL;
|
|
336
|
+
*paras_to_remove = NULL;
|
|
337
|
+
|
|
338
|
+
cmark_iter *iter = cmark_iter_new(document);
|
|
339
|
+
cmark_event_type ev_type;
|
|
340
|
+
|
|
341
|
+
int table_index = -1; /* Track which table we're in */
|
|
342
|
+
int para_index = -1; /* Track paragraph index */
|
|
343
|
+
|
|
344
|
+
while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
|
|
345
|
+
cmark_node *node = cmark_iter_get_node(iter);
|
|
346
|
+
cmark_node_type type = cmark_node_get_type(node);
|
|
347
|
+
|
|
348
|
+
if (ev_type == CMARK_EVENT_ENTER) {
|
|
349
|
+
if (type == CMARK_NODE_TABLE) {
|
|
350
|
+
table_index++; /* New table */
|
|
351
|
+
|
|
352
|
+
/* Check for caption in user_data */
|
|
353
|
+
char *user_data = (char *)cmark_node_get_user_data(node);
|
|
354
|
+
bool caption_found = false;
|
|
355
|
+
if (user_data && strstr(user_data, "data-caption=")) {
|
|
356
|
+
caption_found = true;
|
|
357
|
+
/* Extract caption text - handle both " data-caption=" and just "data-caption=" */
|
|
358
|
+
char caption[512];
|
|
359
|
+
const char *caption_start = strstr(user_data, "data-caption=");
|
|
360
|
+
if (caption_start) {
|
|
361
|
+
caption_start += strlen("data-caption=");
|
|
362
|
+
/* Skip the opening quote */
|
|
363
|
+
if (*caption_start == '"') {
|
|
364
|
+
caption_start++;
|
|
365
|
+
/* Extract until closing quote */
|
|
366
|
+
int i = 0;
|
|
367
|
+
while (*caption_start && *caption_start != '"' && i < 511) {
|
|
368
|
+
caption[i++] = *caption_start++;
|
|
369
|
+
}
|
|
370
|
+
caption[i] = '\0';
|
|
371
|
+
if (i > 0) {
|
|
372
|
+
table_caption *cap = malloc(sizeof(table_caption));
|
|
373
|
+
if (cap) {
|
|
374
|
+
cap->table_index = table_index;
|
|
375
|
+
cap->caption = strdup(caption);
|
|
376
|
+
cap->next = list;
|
|
377
|
+
list = cap;
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
/* If caption not found in user_data, check for caption paragraph before/after table */
|
|
384
|
+
/* This handles cases where IAL processing replaced user_data */
|
|
385
|
+
if (!caption_found) {
|
|
386
|
+
/* Check previous node for caption */
|
|
387
|
+
cmark_node *prev = cmark_node_previous(node);
|
|
388
|
+
if (prev && cmark_node_get_type(prev) == CMARK_NODE_PARAGRAPH) {
|
|
389
|
+
/* Check if previous paragraph is a caption */
|
|
390
|
+
cmark_node *text_node = cmark_node_first_child(prev);
|
|
391
|
+
if (text_node && cmark_node_get_type(text_node) == CMARK_NODE_TEXT) {
|
|
392
|
+
const char *text = cmark_node_get_literal(text_node);
|
|
393
|
+
if (text && text[0] == '[') {
|
|
394
|
+
const char *end = strchr(text + 1, ']');
|
|
395
|
+
if (end) {
|
|
396
|
+
const char *after = end + 1;
|
|
397
|
+
while (*after && isspace((unsigned char)*after)) after++;
|
|
398
|
+
if (*after == '\0') {
|
|
399
|
+
/* This is a caption - extract it */
|
|
400
|
+
size_t caption_len = end - text - 1;
|
|
401
|
+
char *caption = malloc(caption_len + 1);
|
|
402
|
+
if (caption) {
|
|
403
|
+
memcpy(caption, text + 1, caption_len);
|
|
404
|
+
caption[caption_len] = '\0';
|
|
405
|
+
table_caption *cap = malloc(sizeof(table_caption));
|
|
406
|
+
if (cap) {
|
|
407
|
+
cap->table_index = table_index;
|
|
408
|
+
cap->caption = caption;
|
|
409
|
+
cap->next = list;
|
|
410
|
+
list = cap;
|
|
411
|
+
caption_found = true;
|
|
412
|
+
} else {
|
|
413
|
+
free(caption);
|
|
414
|
+
}
|
|
415
|
+
}
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
/* Also check next node for caption */
|
|
422
|
+
if (!caption_found) {
|
|
423
|
+
cmark_node *next = cmark_node_next(node);
|
|
424
|
+
if (next && cmark_node_get_type(next) == CMARK_NODE_PARAGRAPH) {
|
|
425
|
+
cmark_node *text_node = cmark_node_first_child(next);
|
|
426
|
+
if (text_node && cmark_node_get_type(text_node) == CMARK_NODE_TEXT) {
|
|
427
|
+
const char *text = cmark_node_get_literal(text_node);
|
|
428
|
+
if (text && text[0] == '[') {
|
|
429
|
+
const char *end = strchr(text + 1, ']');
|
|
430
|
+
if (end) {
|
|
431
|
+
const char *after = end + 1;
|
|
432
|
+
while (*after && isspace((unsigned char)*after)) after++;
|
|
433
|
+
if (*after == '\0') {
|
|
434
|
+
/* This is a caption - extract it */
|
|
435
|
+
size_t caption_len = end - text - 1;
|
|
436
|
+
char *caption = malloc(caption_len + 1);
|
|
437
|
+
if (caption) {
|
|
438
|
+
memcpy(caption, text + 1, caption_len);
|
|
439
|
+
caption[caption_len] = '\0';
|
|
440
|
+
table_caption *cap = malloc(sizeof(table_caption));
|
|
441
|
+
if (cap) {
|
|
442
|
+
cap->table_index = table_index;
|
|
443
|
+
cap->caption = caption;
|
|
444
|
+
cap->next = list;
|
|
445
|
+
list = cap;
|
|
446
|
+
} else {
|
|
447
|
+
free(caption);
|
|
448
|
+
}
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
}
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
}
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
} else if (type == CMARK_NODE_PARAGRAPH) {
|
|
458
|
+
para_index++;
|
|
459
|
+
|
|
460
|
+
/* Check if this paragraph is marked for removal */
|
|
461
|
+
char *user_data = (char *)cmark_node_get_user_data(node);
|
|
462
|
+
if (user_data && strstr(user_data, "data-remove")) {
|
|
463
|
+
char *fingerprint = get_para_text_fingerprint(node);
|
|
464
|
+
if (fingerprint) {
|
|
465
|
+
para_to_remove *para = malloc(sizeof(para_to_remove));
|
|
466
|
+
if (para) {
|
|
467
|
+
para->para_index = para_index;
|
|
468
|
+
para->text_fingerprint = fingerprint;
|
|
469
|
+
para->next = *paras_to_remove;
|
|
470
|
+
*paras_to_remove = para;
|
|
471
|
+
} else {
|
|
472
|
+
free(fingerprint);
|
|
473
|
+
}
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
}
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
cmark_iter_free(iter);
|
|
481
|
+
return list;
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
/**
 * apex_inject_table_attributes (defined further below):
 * Inject attributes into HTML or remove cells
 * Also wraps tables with captions in <figure> tags
 */
|
|
488
|
+
/**
|
|
489
|
+
* Collect rows that should be in tfoot sections
|
|
490
|
+
*/
|
|
491
|
+
static tfoot_row *collect_tfoot_rows(cmark_node *document) {
|
|
492
|
+
tfoot_row *list = NULL;
|
|
493
|
+
|
|
494
|
+
cmark_iter *iter = cmark_iter_new(document);
|
|
495
|
+
cmark_event_type ev_type;
|
|
496
|
+
|
|
497
|
+
int table_index = -1;
|
|
498
|
+
int row_index = -1;
|
|
499
|
+
|
|
500
|
+
while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
|
|
501
|
+
cmark_node *node = cmark_iter_get_node(iter);
|
|
502
|
+
cmark_node_type type = cmark_node_get_type(node);
|
|
503
|
+
|
|
504
|
+
if (ev_type == CMARK_EVENT_ENTER) {
|
|
505
|
+
if (type == CMARK_NODE_TABLE) {
|
|
506
|
+
table_index++;
|
|
507
|
+
row_index = -1;
|
|
508
|
+
} else if (type == CMARK_NODE_TABLE_ROW) {
|
|
509
|
+
row_index++;
|
|
510
|
+
char *row_attrs = (char *)cmark_node_get_user_data(node);
|
|
511
|
+
if (row_attrs && strstr(row_attrs, "data-tfoot")) {
|
|
512
|
+
tfoot_row *tfoot = malloc(sizeof(tfoot_row));
|
|
513
|
+
if (tfoot) {
|
|
514
|
+
tfoot->table_index = table_index;
|
|
515
|
+
tfoot->row_index = row_index;
|
|
516
|
+
tfoot->next = list;
|
|
517
|
+
list = tfoot;
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
}
|
|
521
|
+
}
|
|
522
|
+
}
|
|
523
|
+
|
|
524
|
+
cmark_iter_free(iter);
|
|
525
|
+
return list;
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
char *apex_inject_table_attributes(const char *html, cmark_node *document, int caption_position) {
|
|
529
|
+
if (!html || !document) return (char *)html;
|
|
530
|
+
|
|
531
|
+
/* Collect all cells with attributes */
|
|
532
|
+
cell_attr *attrs = collect_table_cell_attributes(document);
|
|
533
|
+
|
|
534
|
+
/* Collect tfoot rows */
|
|
535
|
+
tfoot_row *tfoot_rows = collect_tfoot_rows(document);
|
|
536
|
+
|
|
537
|
+
/* Collect all table captions and paragraphs to remove */
|
|
538
|
+
para_to_remove *paras_to_remove = NULL;
|
|
539
|
+
table_caption *captions = collect_table_captions(document, &paras_to_remove);
|
|
540
|
+
|
|
541
|
+
/* Early exit: if there are no attributes, captions, or tfoot rows to process,
|
|
542
|
+
* and no alignment colons, return early to avoid expensive processing.
|
|
543
|
+
* This avoids the expensive collect_all_cells() call and HTML processing for simple tables. */
|
|
544
|
+
bool needs_all_cells = (attrs != NULL || captions != NULL || paras_to_remove != NULL || tfoot_rows != NULL);
|
|
545
|
+
bool has_alignment_colons = false;
|
|
546
|
+
|
|
547
|
+
/* For very large HTML (>50KB), check if we can skip processing entirely */
|
|
548
|
+
size_t html_len = strlen(html);
|
|
549
|
+
if (html_len > 50000 && !needs_all_cells) {
|
|
550
|
+
/* Quick check for alignment colons - if none found, skip everything */
|
|
551
|
+
has_alignment_colons = (strstr(html, ":</td>") != NULL || strstr(html, ":</th>") != NULL);
|
|
552
|
+
/* Also check for rowspan markers (^^) - these need processing */
|
|
553
|
+
bool has_rowspan_markers = (strstr(html, "^^") != NULL);
|
|
554
|
+
if (!has_alignment_colons && !has_rowspan_markers) {
|
|
555
|
+
return (char *)html;
|
|
556
|
+
}
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
/* DEBUG: If there are no attributes at all, we can skip most processing */
|
|
560
|
+
if (attrs == NULL && captions == NULL && paras_to_remove == NULL && tfoot_rows == NULL) {
|
|
561
|
+
/* Check for alignment colons */
|
|
562
|
+
has_alignment_colons = (strstr(html, ":</td>") != NULL || strstr(html, ":</th>") != NULL);
|
|
563
|
+
/* Also check for rowspan markers (^^) - these need processing even without attributes */
|
|
564
|
+
bool has_rowspan_markers = (strstr(html, "^^") != NULL);
|
|
565
|
+
/* Check for empty first header cell (<th></th> in thead) - this indicates row-header detection is needed */
|
|
566
|
+
/* Simple check: if we have <thead> with a <th></th> pattern, we need to process for row-header detection */
|
|
567
|
+
/* Note: We always process tables with <thead> to check for row-header detection, since this happens
|
|
568
|
+
* during HTML processing and we can't detect it from AST attributes alone */
|
|
569
|
+
bool needs_row_header_detection = (strstr(html, "<thead>") != NULL);
|
|
570
|
+
if (!has_alignment_colons && !has_rowspan_markers && !needs_row_header_detection) {
|
|
571
|
+
/* Absolutely nothing to process - return immediately */
|
|
572
|
+
return (char *)html;
|
|
573
|
+
}
|
|
574
|
+
}
|
|
575
|
+
|
|
576
|
+
if (!needs_all_cells) {
|
|
577
|
+
/* Quick check for alignment colons - look for :</td> or :</th> patterns in rendered HTML */
|
|
578
|
+
/* Check for trailing colon alignment (most common) */
|
|
579
|
+
has_alignment_colons = (strstr(html, ":</td>") != NULL || strstr(html, ":</th>") != NULL);
|
|
580
|
+
if (!has_alignment_colons) {
|
|
581
|
+
/* Also check for leading colon alignment - but be more specific to avoid false positives */
|
|
582
|
+
/* Look for pattern like ": text</td>" or ": text</th>" (colon, space, text, closing tag) */
|
|
583
|
+
const char *colon_pos = strstr(html, ":<");
|
|
584
|
+
if (colon_pos) {
|
|
585
|
+
/* Check if it's followed by a closing tag within reasonable distance */
|
|
586
|
+
const char *check = colon_pos + 2;
|
|
587
|
+
int distance = 0;
|
|
588
|
+
while (*check && distance < 200) {
|
|
589
|
+
if (strncmp(check, "</td>", 5) == 0 || strncmp(check, "</th>", 5) == 0) {
|
|
590
|
+
has_alignment_colons = true;
|
|
591
|
+
break;
|
|
592
|
+
}
|
|
593
|
+
check++;
|
|
594
|
+
distance++;
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
}
|
|
598
|
+
/* Also check for rowspan markers (^^) - these need processing even without other attributes */
|
|
599
|
+
bool has_rowspan_markers = (strstr(html, "^^") != NULL);
|
|
600
|
+
/* Also check for empty first header cell - this indicates row-header detection is needed */
|
|
601
|
+
bool needs_row_header_detection = (strstr(html, "<thead>") != NULL);
|
|
602
|
+
if (!has_alignment_colons && !has_rowspan_markers && !needs_row_header_detection) {
|
|
603
|
+
/* No alignment colons, rowspan markers, or row-header detection needed - return early */
|
|
604
|
+
return (char *)html;
|
|
605
|
+
}
|
|
606
|
+
} else {
|
|
607
|
+
/* If we need all cells, check for alignment colons anyway to know if we should process them */
|
|
608
|
+
/* But first check if most cells already have align attributes (cmark-gfm already processed them) */
|
|
609
|
+
/* If so, we can skip alignment processing entirely */
|
|
610
|
+
const char *align_attr_count = html;
|
|
611
|
+
int cells_with_align = 0;
|
|
612
|
+
int total_cells_checked = 0;
|
|
613
|
+
while ((align_attr_count = strstr(align_attr_count, "align=")) != NULL && total_cells_checked < 100) {
|
|
614
|
+
cells_with_align++;
|
|
615
|
+
total_cells_checked++;
|
|
616
|
+
align_attr_count += 6; /* Skip past "align=" */
|
|
617
|
+
}
|
|
618
|
+
/* If most cells (>=80%) already have align attributes, skip alignment processing */
|
|
619
|
+
if (total_cells_checked >= 20 && cells_with_align * 100 / total_cells_checked >= 80) {
|
|
620
|
+
has_alignment_colons = false; /* Skip alignment processing - already handled by cmark-gfm */
|
|
621
|
+
} else {
|
|
622
|
+
has_alignment_colons = (strstr(html, ":</td>") != NULL || strstr(html, ":</th>") != NULL);
|
|
623
|
+
}
|
|
624
|
+
}
|
|
625
|
+
|
|
626
|
+
/* Collect all cells (for mapping calculation) - only needed for attribute processing, not alignment */
|
|
627
|
+
all_cell *all_cells = NULL;
|
|
628
|
+
if (needs_all_cells) {
|
|
629
|
+
/* IMPORTANT: Always process if we have ANY attributes, even if they seem "simple".
|
|
630
|
+
* This ensures rowspan/colspan processing isn't skipped. Only skip if we truly
|
|
631
|
+
* have nothing to process (no attributes, no captions, no tfoot, no alignment). */
|
|
632
|
+
if (attrs == NULL && captions == NULL && tfoot_rows == NULL && !has_alignment_colons) {
|
|
633
|
+
return (char *)html;
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
all_cells = collect_all_cells(document);
|
|
637
|
+
}
|
|
638
|
+
|
|
639
|
+
/* Allocate output buffer (same size as input, we'll realloc if needed) */
|
|
640
|
+
size_t capacity = strlen(html) * 2;
|
|
641
|
+
char *output = malloc(capacity);
|
|
642
|
+
if (!output) {
|
|
643
|
+
/* Clean up */
|
|
644
|
+
while (all_cells) {
|
|
645
|
+
all_cell *next = all_cells->next;
|
|
646
|
+
free(all_cells);
|
|
647
|
+
all_cells = next;
|
|
648
|
+
}
|
|
649
|
+
while (attrs) {
|
|
650
|
+
cell_attr *next = attrs->next;
|
|
651
|
+
free(attrs->attributes);
|
|
652
|
+
if (attrs->cell_text) free(attrs->cell_text);
|
|
653
|
+
free(attrs);
|
|
654
|
+
attrs = next;
|
|
655
|
+
}
|
|
656
|
+
while (captions) {
|
|
657
|
+
table_caption *next = captions->next;
|
|
658
|
+
free(captions->caption);
|
|
659
|
+
free(captions);
|
|
660
|
+
captions = next;
|
|
661
|
+
}
|
|
662
|
+
while (paras_to_remove) {
|
|
663
|
+
para_to_remove *next = paras_to_remove->next;
|
|
664
|
+
free(paras_to_remove->text_fingerprint);
|
|
665
|
+
free(paras_to_remove);
|
|
666
|
+
paras_to_remove = next;
|
|
667
|
+
}
|
|
668
|
+
return (char *)html;
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
const char *read = html;
|
|
672
|
+
char *write = output;
|
|
673
|
+
size_t written = 0;
|
|
674
|
+
int table_idx = -1; /* Track which table we're in */
|
|
675
|
+
int row_idx = -1;
|
|
676
|
+
int ast_row_idx = -1; /* AST row index corresponding to current HTML row */
|
|
677
|
+
int col_idx = 0;
|
|
678
|
+
int para_idx = -1; /* Track paragraph index */
|
|
679
|
+
bool in_table = false;
|
|
680
|
+
bool in_row = false;
|
|
681
|
+
bool in_tbody = false; /* Track if we're currently in tbody */
|
|
682
|
+
bool in_tfoot = false; /* Track if we're currently in tfoot */
|
|
683
|
+
bool in_thead = false; /* Track if we're currently in thead */
|
|
684
|
+
bool current_row_is_tfoot = false; /* Track if current row is a tfoot row */
|
|
685
|
+
|
|
686
|
+
/* Pre-calculated mapping for current row: HTML position -> original column index */
|
|
687
|
+
int row_col_mapping[50]; /* row_col_mapping[html_pos] = original_col_index */
|
|
688
|
+
int row_col_mapping_size = 0;
|
|
689
|
+
|
|
690
|
+
/* Track if we should process alignment (only if alignment colons were detected) */
|
|
691
|
+
bool should_process_alignment = has_alignment_colons;
|
|
692
|
+
|
|
693
|
+
/* Track active rowspan cells per column (inspired by Jekyll Spaceship approach).
|
|
694
|
+
* active_rowspan_cells[col] points to the cell_attr for the cell that's currently
|
|
695
|
+
* being rowspanned in that column. When we see a ^^ cell, we increment its rowspan. */
|
|
696
|
+
cell_attr *active_rowspan_cells[50]; /* One per column */
|
|
697
|
+
for (int i = 0; i < 50; i++) active_rowspan_cells[i] = NULL;
|
|
698
|
+
|
|
699
|
+
/* Track tables that have an explicit row-header first column.
|
|
700
|
+
* Detected when the first header row's first cell is empty (| | Header ...). */
|
|
701
|
+
bool table_has_row_header_first_col[50];
|
|
702
|
+
for (int i = 0; i < 50; i++) table_has_row_header_first_col[i] = false;
|
|
703
|
+
|
|
704
|
+
/* Track the previous cell's matching status to check for colspan */
|
|
705
|
+
cell_attr *prev_cell_matching = NULL;
|
|
706
|
+
|
|
707
|
+
/* Timeout check: if processing takes more than 10 seconds, skip the rest */
|
|
708
|
+
time_t start_time = time(NULL);
|
|
709
|
+
const time_t timeout_seconds = 10;
|
|
710
|
+
size_t timeout_check_counter = 0;
|
|
711
|
+
|
|
712
|
+
while (*read) {
|
|
713
|
+
timeout_check_counter++;
|
|
714
|
+
|
|
715
|
+
/* Check the timeout once every 1000 loop iterations */
|
|
716
|
+
if (timeout_check_counter % 1000 == 0) {
|
|
717
|
+
time_t current_time = time(NULL);
|
|
718
|
+
if (current_time - start_time >= timeout_seconds) {
|
|
719
|
+
/* Copy remaining HTML as-is to avoid corruption */
|
|
720
|
+
while (*read) {
|
|
721
|
+
*write++ = *read++;
|
|
722
|
+
}
|
|
723
|
+
*write = '\0';
|
|
724
|
+
goto done;
|
|
725
|
+
}
|
|
726
|
+
}
|
|
727
|
+
/* Ensure we have space (realloc if needed) */
|
|
728
|
+
if (written + 100 > capacity) {
|
|
729
|
+
capacity *= 2;
|
|
730
|
+
char *new_output = realloc(output, capacity);
|
|
731
|
+
if (!new_output) break;
|
|
732
|
+
output = new_output;
|
|
733
|
+
write = output + written;
|
|
734
|
+
}
|
|
735
|
+
/* Track table structure (BEFORE cell processing so indices are correct) */
|
|
736
|
+
/* Also fix missing space in table tag (e.g., <tableid= -> <table id=) */
|
|
737
|
+
if (strncmp(read, "<table", 6) == 0 && (read[6] == '>' || read[6] == ' ' || (read[6] == 'i' && strncmp(read + 6, "id=", 3) == 0) || isalnum((unsigned char)read[6]))) {
|
|
738
|
+
in_table = true;
|
|
739
|
+
table_idx++; /* New table */
|
|
740
|
+
row_idx = -1; /* Reset for each new table */
|
|
741
|
+
in_thead = false;
|
|
742
|
+
|
|
743
|
+
/* Check if this table has a caption (before fixing spacing so we can add figure tag) */
|
|
744
|
+
table_caption *cap = NULL;
|
|
745
|
+
for (table_caption *c = captions; c; c = c->next) {
|
|
746
|
+
if (c->table_index == table_idx) {
|
|
747
|
+
cap = c;
|
|
748
|
+
break;
|
|
749
|
+
}
|
|
750
|
+
}
|
|
751
|
+
|
|
752
|
+
/* Fix missing space before an id attribute (e.g., <tableid= -> <table id=) */
|
|
753
|
+
if (read[6] == 'i' && strncmp(read + 6, "id=", 3) == 0) {
|
|
754
|
+
/* If we have a caption and it should be above, write figure tag and caption first */
|
|
755
|
+
if (cap && caption_position == 0) {
|
|
756
|
+
/* Write <figure><figcaption>caption</figcaption> */
|
|
757
|
+
const char *fig_open = "<figure class=\"table-figure\">\n<figcaption>";
|
|
758
|
+
size_t fig_open_len = strlen(fig_open);
|
|
759
|
+
const char *fig_close_cap = "</figcaption>\n";
|
|
760
|
+
size_t fig_close_cap_len = strlen(fig_close_cap);
|
|
761
|
+
|
|
762
|
+
/* Ensure we have space */
|
|
763
|
+
while (written + fig_open_len + strlen(cap->caption) * 6 + fig_close_cap_len + 100 > capacity) {
|
|
764
|
+
capacity *= 2;
|
|
765
|
+
char *new_output = realloc(output, capacity);
|
|
766
|
+
if (!new_output) break;
|
|
767
|
+
output = new_output;
|
|
768
|
+
write = output + written;
|
|
769
|
+
}
|
|
770
|
+
|
|
771
|
+
memcpy(write, fig_open, fig_open_len);
|
|
772
|
+
write += fig_open_len;
|
|
773
|
+
written += fig_open_len;
|
|
774
|
+
|
|
775
|
+
/* Write caption text (escape HTML entities if needed) */
|
|
776
|
+
const char *cap_text = cap->caption;
|
|
777
|
+
while (*cap_text) {
|
|
778
|
+
if (*cap_text == '&') {
|
|
779
|
+
const char *amp = "&amp;";
|
|
780
|
+
memcpy(write, amp, 5);
|
|
781
|
+
write += 5;
|
|
782
|
+
written += 5;
|
|
783
|
+
} else if (*cap_text == '<') {
|
|
784
|
+
const char *lt = "&lt;";
|
|
785
|
+
memcpy(write, lt, 4);
|
|
786
|
+
write += 4;
|
|
787
|
+
written += 4;
|
|
788
|
+
} else if (*cap_text == '>') {
|
|
789
|
+
const char *gt = "&gt;";
|
|
790
|
+
memcpy(write, gt, 4);
|
|
791
|
+
write += 4;
|
|
792
|
+
written += 4;
|
|
793
|
+
} else if (*cap_text == '"') {
|
|
794
|
+
const char *quot = "&quot;";
|
|
795
|
+
memcpy(write, quot, 6);
|
|
796
|
+
write += 6;
|
|
797
|
+
written += 6;
|
|
798
|
+
} else {
|
|
799
|
+
*write++ = *cap_text;
|
|
800
|
+
written++;
|
|
801
|
+
}
|
|
802
|
+
cap_text++;
|
|
803
|
+
}
|
|
804
|
+
|
|
805
|
+
memcpy(write, fig_close_cap, fig_close_cap_len);
|
|
806
|
+
write += fig_close_cap_len;
|
|
807
|
+
written += fig_close_cap_len;
|
|
808
|
+
} else if (cap && caption_position == 1) {
|
|
809
|
+
/* Caption below - just open figure tag */
|
|
810
|
+
const char *fig_open = "<figure class=\"table-figure\">\n";
|
|
811
|
+
size_t fig_open_len = strlen(fig_open);
|
|
812
|
+
while (written + fig_open_len + 100 > capacity) {
|
|
813
|
+
capacity *= 2;
|
|
814
|
+
char *new_output = realloc(output, capacity);
|
|
815
|
+
if (!new_output) break;
|
|
816
|
+
output = new_output;
|
|
817
|
+
write = output + written;
|
|
818
|
+
}
|
|
819
|
+
memcpy(write, fig_open, fig_open_len);
|
|
820
|
+
write += fig_open_len;
|
|
821
|
+
written += fig_open_len;
|
|
822
|
+
}
|
|
823
|
+
|
|
824
|
+
/* Write "<table " then copy the rest of the tag */
|
|
825
|
+
memcpy(write, read, 6);
|
|
826
|
+
write += 6;
|
|
827
|
+
*write++ = ' ';
|
|
828
|
+
written += 7;
|
|
829
|
+
read += 6;
|
|
830
|
+
/* Copy the rest of the tag until closing > */
|
|
831
|
+
while (*read && *read != '>') {
|
|
832
|
+
*write++ = *read++;
|
|
833
|
+
written++;
|
|
834
|
+
}
|
|
835
|
+
if (*read == '>') {
|
|
836
|
+
*write++ = *read++;
|
|
837
|
+
written++;
|
|
838
|
+
}
|
|
839
|
+
continue; /* Skip the normal copy below, we've handled it */
|
|
840
|
+
}
|
|
841
|
+
|
|
842
|
+
/* Normal table tag (no spacing fix needed) - check for caption */
|
|
843
|
+
if (cap) {
|
|
844
|
+
if (caption_position == 0) {
|
|
845
|
+
/* Caption above - write <figure><figcaption>caption</figcaption> */
|
|
846
|
+
const char *fig_open = "<figure class=\"table-figure\">\n<figcaption>";
|
|
847
|
+
size_t fig_open_len = strlen(fig_open);
|
|
848
|
+
const char *fig_close_cap = "</figcaption>\n";
|
|
849
|
+
size_t fig_close_cap_len = strlen(fig_close_cap);
|
|
850
|
+
|
|
851
|
+
/* Ensure we have space */
|
|
852
|
+
while (written + fig_open_len + strlen(cap->caption) * 6 + fig_close_cap_len + 100 > capacity) {
|
|
853
|
+
capacity *= 2;
|
|
854
|
+
char *new_output = realloc(output, capacity);
|
|
855
|
+
if (!new_output) break;
|
|
856
|
+
output = new_output;
|
|
857
|
+
write = output + written;
|
|
858
|
+
}
|
|
859
|
+
|
|
860
|
+
memcpy(write, fig_open, fig_open_len);
|
|
861
|
+
write += fig_open_len;
|
|
862
|
+
written += fig_open_len;
|
|
863
|
+
|
|
864
|
+
/* Write caption text (escape HTML entities if needed) */
|
|
865
|
+
const char *cap_text = cap->caption;
|
|
866
|
+
while (*cap_text) {
|
|
867
|
+
if (*cap_text == '&') {
|
|
868
|
+
const char *amp = "&amp;";
|
|
869
|
+
memcpy(write, amp, 5);
|
|
870
|
+
write += 5;
|
|
871
|
+
written += 5;
|
|
872
|
+
} else if (*cap_text == '<') {
|
|
873
|
+
const char *lt = "&lt;";
|
|
874
|
+
memcpy(write, lt, 4);
|
|
875
|
+
write += 4;
|
|
876
|
+
written += 4;
|
|
877
|
+
} else if (*cap_text == '>') {
|
|
878
|
+
const char *gt = "&gt;";
|
|
879
|
+
memcpy(write, gt, 4);
|
|
880
|
+
write += 4;
|
|
881
|
+
written += 4;
|
|
882
|
+
} else if (*cap_text == '"') {
|
|
883
|
+
const char *quot = "&quot;";
|
|
884
|
+
memcpy(write, quot, 6);
|
|
885
|
+
write += 6;
|
|
886
|
+
written += 6;
|
|
887
|
+
} else {
|
|
888
|
+
*write++ = *cap_text;
|
|
889
|
+
written++;
|
|
890
|
+
}
|
|
891
|
+
cap_text++;
|
|
892
|
+
}
|
|
893
|
+
|
|
894
|
+
memcpy(write, fig_close_cap, fig_close_cap_len);
|
|
895
|
+
write += fig_close_cap_len;
|
|
896
|
+
written += fig_close_cap_len;
|
|
897
|
+
} else {
|
|
898
|
+
/* Caption below - just open figure tag */
|
|
899
|
+
const char *fig_open = "<figure class=\"table-figure\">\n";
|
|
900
|
+
size_t fig_open_len = strlen(fig_open);
|
|
901
|
+
while (written + fig_open_len + 100 > capacity) {
|
|
902
|
+
capacity *= 2;
|
|
903
|
+
char *new_output = realloc(output, capacity);
|
|
904
|
+
if (!new_output) break;
|
|
905
|
+
output = new_output;
|
|
906
|
+
write = output + written;
|
|
907
|
+
}
|
|
908
|
+
memcpy(write, fig_open, fig_open_len);
|
|
909
|
+
write += fig_open_len;
|
|
910
|
+
written += fig_open_len;
|
|
911
|
+
}
|
|
912
|
+
}
|
|
913
|
+
} else if (strncmp(read, "</table>", 8) == 0) {
|
|
914
|
+
/* Close tfoot or tbody if still open */
|
|
915
|
+
if (in_tfoot) {
|
|
916
|
+
const char *tfoot_close = "</tfoot>\n";
|
|
917
|
+
size_t tfoot_close_len = strlen(tfoot_close);
|
|
918
|
+
while (written + tfoot_close_len > capacity) {
|
|
919
|
+
capacity *= 2;
|
|
920
|
+
char *new_output = realloc(output, capacity);
|
|
921
|
+
if (!new_output) break;
|
|
922
|
+
output = new_output;
|
|
923
|
+
write = output + written;
|
|
924
|
+
}
|
|
925
|
+
memcpy(write, tfoot_close, tfoot_close_len);
|
|
926
|
+
write += tfoot_close_len;
|
|
927
|
+
written += tfoot_close_len;
|
|
928
|
+
in_tfoot = false;
|
|
929
|
+
} else if (in_tbody) {
|
|
930
|
+
const char *tbody_close = "</tbody>\n";
|
|
931
|
+
size_t tbody_close_len = strlen(tbody_close);
|
|
932
|
+
while (written + tbody_close_len > capacity) {
|
|
933
|
+
capacity *= 2;
|
|
934
|
+
char *new_output = realloc(output, capacity);
|
|
935
|
+
if (!new_output) break;
|
|
936
|
+
output = new_output;
|
|
937
|
+
write = output + written;
|
|
938
|
+
}
|
|
939
|
+
memcpy(write, tbody_close, tbody_close_len);
|
|
940
|
+
write += tbody_close_len;
|
|
941
|
+
written += tbody_close_len;
|
|
942
|
+
in_tbody = false;
|
|
943
|
+
}
|
|
944
|
+
|
|
945
|
+
/* Check if this table had a caption */
|
|
946
|
+
table_caption *cap = NULL;
|
|
947
|
+
for (table_caption *c = captions; c; c = c->next) {
|
|
948
|
+
if (c->table_index == table_idx) {
|
|
949
|
+
cap = c;
|
|
950
|
+
break;
|
|
951
|
+
}
|
|
952
|
+
}
|
|
953
|
+
|
|
954
|
+
if (cap) {
|
|
955
|
+
/* Write </table> first */
|
|
956
|
+
memcpy(write, read, 8);
|
|
957
|
+
write += 8;
|
|
958
|
+
read += 8;
|
|
959
|
+
written += 8;
|
|
960
|
+
|
|
961
|
+
if (caption_position == 1) {
|
|
962
|
+
/* Caption below - write <figcaption>caption</figcaption> before </figure> */
|
|
963
|
+
const char *fig_cap_open = "<figcaption>";
|
|
964
|
+
const char *fig_cap_close = "</figcaption>\n";
|
|
965
|
+
size_t fig_cap_open_len = strlen(fig_cap_open);
|
|
966
|
+
size_t fig_cap_close_len = strlen(fig_cap_close);
|
|
967
|
+
|
|
968
|
+
/* Ensure we have space */
|
|
969
|
+
while (written + fig_cap_open_len + strlen(cap->caption) * 6 + fig_cap_close_len + 100 > capacity) {
|
|
970
|
+
capacity *= 2;
|
|
971
|
+
char *new_output = realloc(output, capacity);
|
|
972
|
+
if (!new_output) break;
|
|
973
|
+
output = new_output;
|
|
974
|
+
write = output + written;
|
|
975
|
+
}
|
|
976
|
+
|
|
977
|
+
memcpy(write, fig_cap_open, fig_cap_open_len);
|
|
978
|
+
write += fig_cap_open_len;
|
|
979
|
+
written += fig_cap_open_len;
|
|
980
|
+
|
|
981
|
+
/* Write caption text (escape HTML entities if needed) */
|
|
982
|
+
const char *cap_text = cap->caption;
|
|
983
|
+
while (*cap_text) {
|
|
984
|
+
if (*cap_text == '&') {
|
|
985
|
+
const char *amp = "&amp;";
|
|
986
|
+
memcpy(write, amp, 5);
|
|
987
|
+
write += 5;
|
|
988
|
+
written += 5;
|
|
989
|
+
} else if (*cap_text == '<') {
|
|
990
|
+
const char *lt = "&lt;";
|
|
991
|
+
memcpy(write, lt, 4);
|
|
992
|
+
write += 4;
|
|
993
|
+
written += 4;
|
|
994
|
+
} else if (*cap_text == '>') {
|
|
995
|
+
const char *gt = "&gt;";
|
|
996
|
+
memcpy(write, gt, 4);
|
|
997
|
+
write += 4;
|
|
998
|
+
written += 4;
|
|
999
|
+
} else if (*cap_text == '"') {
|
|
1000
|
+
const char *quot = "&quot;";
|
|
1001
|
+
memcpy(write, quot, 6);
|
|
1002
|
+
write += 6;
|
|
1003
|
+
written += 6;
|
|
1004
|
+
} else {
|
|
1005
|
+
*write++ = *cap_text;
|
|
1006
|
+
written++;
|
|
1007
|
+
}
|
|
1008
|
+
cap_text++;
|
|
1009
|
+
}
|
|
1010
|
+
|
|
1011
|
+
memcpy(write, fig_cap_close, fig_cap_close_len);
|
|
1012
|
+
write += fig_cap_close_len;
|
|
1013
|
+
written += fig_cap_close_len;
|
|
1014
|
+
}
|
|
1015
|
+
|
|
1016
|
+
/* Close </figure> */
|
|
1017
|
+
const char *fig_close = "</figure>\n";
|
|
1018
|
+
size_t fig_close_len = strlen(fig_close);
|
|
1019
|
+
|
|
1020
|
+
/* Ensure we have space */
|
|
1021
|
+
while (written + fig_close_len + 100 > capacity) {
|
|
1022
|
+
capacity *= 2;
|
|
1023
|
+
char *new_output = realloc(output, capacity);
|
|
1024
|
+
if (!new_output) break;
|
|
1025
|
+
output = new_output;
|
|
1026
|
+
write = output + written;
|
|
1027
|
+
}
|
|
1028
|
+
|
|
1029
|
+
memcpy(write, fig_close, fig_close_len);
|
|
1030
|
+
write += fig_close_len;
|
|
1031
|
+
written += fig_close_len;
|
|
1032
|
+
|
|
1033
|
+
in_table = false;
|
|
1034
|
+
continue; /* Skip the normal copy below */
|
|
1035
|
+
}
|
|
1036
|
+
|
|
1037
|
+
in_table = false;
|
|
1038
|
+
} else if (in_table && strncmp(read, "<thead>", 7) == 0) {
|
|
1039
|
+
in_thead = true;
|
|
1040
|
+
} else if (in_table && strncmp(read, "</thead>", 8) == 0) {
|
|
1041
|
+
in_thead = false;
|
|
1042
|
+
} else if (in_table && strncmp(read, "<tbody>", 7) == 0) {
|
|
1043
|
+
/* We're entering tbody - mark it */
|
|
1044
|
+
in_tbody = true;
|
|
1045
|
+
in_tfoot = false;
|
|
1046
|
+
} else if (in_table && strncmp(read, "</tbody>", 8) == 0) {
|
|
1047
|
+
/* We're leaving tbody - but only if we haven't already closed it for tfoot */
|
|
1048
|
+
if (!in_tfoot) {
|
|
1049
|
+
in_tbody = false;
|
|
1050
|
+
} else {
|
|
1051
|
+
/* We're already in tfoot, so skip this </tbody> tag (we already closed it) */
|
|
1052
|
+
read += 8;
|
|
1053
|
+
continue;
|
|
1054
|
+
}
|
|
1055
|
+
} else if (in_table && strncmp(read, "<tr>", 4) == 0) {
|
|
1056
|
+
row_idx++;
|
|
1057
|
+
col_idx = 0;
|
|
1058
|
+
prev_cell_matching = NULL; /* Reset previous cell matching for new row */
|
|
1059
|
+
|
|
1060
|
+
/* Map HTML row index to AST row index.
|
|
1061
|
+
* HTML rows skip separator rows (which are marked for removal in AST).
|
|
1062
|
+
* So we need to find the AST row that corresponds to this HTML row.
|
|
1063
|
+
*
|
|
1064
|
+
* IMPORTANT: row_idx includes ALL <tr> tags, including the header in <thead>.
|
|
1065
|
+
* But the header row is handled separately, so for tbody/tfoot rows, we need to
|
|
1066
|
+
* account for that. The header is typically AST row 0, and it's the first HTML row.
|
|
1067
|
+
* So for tbody/tfoot rows, row_idx will be >= 1 (1 = first data row, 2 = second data row, etc.).
|
|
1068
|
+
*
|
|
1069
|
+
* The mapping should count all non-removed rows, including the header, to match row_idx.
|
|
1070
|
+
* Note: This mapping is only needed when we have attributes to process. */
|
|
1071
|
+
ast_row_idx = -1;
|
|
1072
|
+
if (all_cells) {
|
|
1073
|
+
int html_row_count = -1; /* Start at -1, will be 0 for header */
|
|
1074
|
+
for (int r = 0; r < 100; r++) { /* Check up to 100 AST rows */
|
|
1075
|
+
/* Check if this AST row has any non-removed cells */
|
|
1076
|
+
bool has_non_removed = false;
|
|
1077
|
+
for (all_cell *c = all_cells; c; c = c->next) {
|
|
1078
|
+
if (c->table_index == table_idx &&
|
|
1079
|
+
c->row_index == r &&
|
|
1080
|
+
!c->is_removed) {
|
|
1081
|
+
has_non_removed = true;
|
|
1082
|
+
break;
|
|
1083
|
+
}
|
|
1084
|
+
}
|
|
1085
|
+
/* If this AST row has non-removed cells, it appears in HTML */
|
|
1086
|
+
if (has_non_removed) {
|
|
1087
|
+
html_row_count++;
|
|
1088
|
+
if (html_row_count == row_idx) {
|
|
1089
|
+
ast_row_idx = r;
|
|
1090
|
+
break;
|
|
1091
|
+
}
|
|
1092
|
+
}
|
|
1093
|
+
}
|
|
1094
|
+
} else {
|
|
1095
|
+
/* No all_cells available (alignment-only processing) - use row_idx directly */
|
|
1096
|
+
ast_row_idx = row_idx;
|
|
1097
|
+
}
|
|
1098
|
+
|
|
1099
|
+
if (ast_row_idx == -1) {
|
|
1100
|
+
/* Fallback: use row_idx directly */
|
|
1101
|
+
ast_row_idx = row_idx;
|
|
1102
|
+
}
|
|
1103
|
+
|
|
1104
|
+
/* Pre-calculate which original columns will be visible in this row's HTML.
|
|
1105
|
+
* This creates a mapping: HTML position -> original column index.
|
|
1106
|
+
* IMPORTANT: Only include columns that render NEW <td> tags in this HTML row.
|
|
1107
|
+
* Columns occupied by rowspans from previous rows should NOT be included,
|
|
1108
|
+
* because they don't generate new <td> tags in this row. */
|
|
1109
|
+
row_col_mapping_size = 0;
|
|
1110
|
+
|
|
1111
|
+
/* For each original column, check if it renders a NEW cell in this HTML row */
|
|
1112
|
+
for (int orig_col = 0; orig_col < 50 && row_col_mapping_size < 50; orig_col++) {
|
|
1113
|
+
bool renders_new_cell = false;
|
|
1114
|
+
|
|
1115
|
+
/* Check if this column has a non-removed cell in current AST row */
|
|
1116
|
+
for (all_cell *c = all_cells; c; c = c->next) {
|
|
1117
|
+
if (c->table_index == table_idx &&
|
|
1118
|
+
c->row_index == ast_row_idx &&
|
|
1119
|
+
c->col_index == orig_col &&
|
|
1120
|
+
!c->is_removed) {
|
|
1121
|
+
/* This column has a new cell in the current row */
|
|
1122
|
+
renders_new_cell = true;
|
|
1123
|
+
break;
|
|
1124
|
+
}
|
|
1125
|
+
}
|
|
1126
|
+
|
|
1127
|
+
/* Check if this column is covered by an active rowspan from a previous row.
|
|
1128
|
+
* If a previous row has a cell with rowspan that's still active (spans to this row),
|
|
1129
|
+
* then this column won't render a new <td> tag in this HTML row. */
|
|
1130
|
+
bool covered_by_rowspan = false;
|
|
1131
|
+
if (renders_new_cell && row_idx > 0) {
|
|
1132
|
+
/* Check all previous HTML rows to see if any have a rowspan covering this column */
|
|
1133
|
+
for (int prev_html_row = 0; prev_html_row < row_idx; prev_html_row++) {
|
|
1134
|
+
/* Map previous HTML row to AST row */
|
|
1135
|
+
int prev_ast_row = -1;
|
|
1136
|
+
int prev_html_row_count = -1;
|
|
1137
|
+
for (int r = 0; r < 100; r++) {
|
|
1138
|
+
bool has_non_removed = false;
|
|
1139
|
+
for (all_cell *c = all_cells; c; c = c->next) {
|
|
1140
|
+
if (c->table_index == table_idx &&
|
|
1141
|
+
c->row_index == r &&
|
|
1142
|
+
!c->is_removed) {
|
|
1143
|
+
has_non_removed = true;
|
|
1144
|
+
break;
|
|
1145
|
+
}
|
|
1146
|
+
}
|
|
1147
|
+
if (has_non_removed) {
|
|
1148
|
+
prev_html_row_count++;
|
|
1149
|
+
if (prev_html_row_count == prev_html_row) {
|
|
1150
|
+
prev_ast_row = r;
|
|
1151
|
+
break;
|
|
1152
|
+
}
|
|
1153
|
+
}
|
|
1154
|
+
}
|
|
1155
|
+
if (prev_ast_row >= 0) {
|
|
1156
|
+
/* Check if this AST row has a cell at orig_col with rowspan that spans to current row */
|
|
1157
|
+
for (cell_attr *a = attrs; a; a = a->next) {
|
|
1158
|
+
if (a->table_index == table_idx &&
|
|
1159
|
+
a->row_index == prev_ast_row &&
|
|
1160
|
+
a->col_index == orig_col &&
|
|
1161
|
+
strstr(a->attributes, "rowspan=")) {
|
|
1162
|
+
int rowspan_val = 1;
|
|
1163
|
+
sscanf(strstr(a->attributes, "rowspan="), "rowspan=\"%d\"", &rowspan_val);
|
|
1164
|
+
/* Check if this rowspan spans to the current HTML row */
|
|
1165
|
+
int rows_spanned = row_idx - prev_html_row;
|
|
1166
|
+
if (rows_spanned < rowspan_val) {
|
|
1167
|
+
covered_by_rowspan = true;
|
|
1168
|
+
break;
|
|
1169
|
+
}
|
|
1170
|
+
}
|
|
1171
|
+
}
|
|
1172
|
+
if (covered_by_rowspan) break;
|
|
1173
|
+
}
|
|
1174
|
+
}
|
|
1175
|
+
}
|
|
1176
|
+
|
|
1177
|
+
/* Do NOT include columns occupied by rowspans from previous rows,
|
|
1178
|
+
* because they don't generate new <td> tags in this HTML row */
|
|
1179
|
+
|
|
1180
|
+
if (renders_new_cell && !covered_by_rowspan) {
|
|
1181
|
+
row_col_mapping[row_col_mapping_size++] = orig_col;
|
|
1182
|
+
}
|
|
1183
|
+
}
|
|
1184
|
+
|
|
1185
|
+
/* Store the mapping in a way we can access it during cell processing */
|
|
1186
|
+
/* We'll use a simple approach: store it in a static array keyed by table_idx and row_idx */
|
|
1187
|
+
/* Actually, let's use a simpler approach: process cells immediately using this mapping */
|
|
1188
|
+
|
|
1189
|
+
/* Note: Captions immediately following tables (with no blank line) are not supported.
|
|
1190
|
+
* When a caption like [Caption] appears on the line immediately after a table, cmark-gfm
|
|
1191
|
+
* parses it as a table row rather than a paragraph, making it difficult to detect and extract
|
|
1192
|
+
* reliably. Captions work correctly when:
|
|
1193
|
+
* - They appear before the table (with or without blank line)
|
|
1194
|
+
* - They appear after the table with a blank line between (parsed as a paragraph)
|
|
1195
|
+
* To use a caption after a table, include a blank line between the table and caption.
|
|
1196
|
+
*/
|
|
1197
|
+
|
|
1198
|
+
/* Check if this row should be in tfoot.
|
|
1199
|
+
* tfoot_rows are stored with AST row indices, so we need to check ast_row_idx.
|
|
1200
|
+
*
|
|
1201
|
+
* CRITICAL: A row should only be in tfoot if it comes AFTER the === row in HTML.
|
|
1202
|
+
* Even if a row is marked as tfoot in the AST, if it appears before the === row
|
|
1203
|
+
* in the HTML output (because the === row is skipped), it must be in tbody. */
|
|
1204
|
+
current_row_is_tfoot = false;
|
|
1205
|
+
|
|
1206
|
+
/* First, find the === row's AST index */
|
|
1207
|
+
int min_equals_row_idx = -1;
|
|
1208
|
+
for (int r = 0; r < 100; r++) {
|
|
1209
|
+
int eq_total = 0;
|
|
1210
|
+
int eq_removed = 0;
|
|
1211
|
+
for (all_cell *c = all_cells; c; c = c->next) {
|
|
1212
|
+
if (c->table_index == table_idx && c->row_index == r) {
|
|
1213
|
+
eq_total++;
|
|
1214
|
+
if (c->is_removed) eq_removed++;
|
|
1215
|
+
}
|
|
1216
|
+
}
|
|
1217
|
+
if (eq_total > 0 && eq_total == eq_removed) {
|
|
1218
|
+
min_equals_row_idx = r;
|
|
1219
|
+
break; /* Found the first === row */
|
|
1220
|
+
}
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1223
|
+
/* Check if this row is marked as tfoot in the AST */
|
|
1224
|
+
bool is_marked_tfoot = false;
|
|
1225
|
+
for (tfoot_row *t = tfoot_rows; t; t = t->next) {
|
|
1226
|
+
if (t->table_index == table_idx && t->row_index == ast_row_idx) {
|
|
1227
|
+
is_marked_tfoot = true;
|
|
1228
|
+
break;
|
|
1229
|
+
}
|
|
1230
|
+
}
|
|
1231
|
+
|
|
1232
|
+
/* CRITICAL: Before checking tfoot marking, verify HTML position.
|
|
1233
|
+
* If this row appears before the === row in HTML, it MUST be in tbody,
|
|
1234
|
+
* regardless of AST marking. This check runs even if the row is not marked as tfoot,
|
|
1235
|
+
* to handle cases where the row might be incorrectly processed. */
|
|
1236
|
+
if (min_equals_row_idx >= 0) {
|
|
1237
|
+
/* Calculate how many HTML rows appear before the === row */
|
|
1238
|
+
int html_rows_before_equals = -1; /* Start at -1, will be 0 for header */
|
|
1239
|
+
for (int r = 0; r < 100 && r <= min_equals_row_idx; r++) {
|
|
1240
|
+
bool has_non_removed = false;
|
|
1241
|
+
for (all_cell *c = all_cells; c; c = c->next) {
|
|
1242
|
+
if (c->table_index == table_idx &&
|
|
1243
|
+
c->row_index == r &&
|
|
1244
|
+
!c->is_removed) {
|
|
1245
|
+
has_non_removed = true;
|
|
1246
|
+
break;
|
|
1247
|
+
}
|
|
1248
|
+
}
|
|
1249
|
+
if (has_non_removed) {
|
|
1250
|
+
html_rows_before_equals++;
|
|
1251
|
+
}
|
|
1252
|
+
}
|
|
1253
|
+
|
|
1254
|
+
/* If this row's HTML position is before the === row, force it to tbody.
|
|
1255
|
+
* Since the === row is skipped, rows with row_idx <= html_rows_before_equals + 1
|
|
1256
|
+
* appear before the === row in HTML.
|
|
1257
|
+
*
|
|
1258
|
+
* CRITICAL: We must set current_row_is_tfoot = false BEFORE the skip check,
|
|
1259
|
+
* so that rows forced to tbody are not skipped. */
|
|
1260
|
+
if (html_rows_before_equals >= 0 && row_idx <= html_rows_before_equals + 1) {
|
|
1261
|
+
current_row_is_tfoot = false;
|
|
1262
|
+
} else if (ast_row_idx >= 0 && ast_row_idx <= min_equals_row_idx) {
|
|
1263
|
+
/* Also check AST position as a fallback */
|
|
1264
|
+
current_row_is_tfoot = false;
|
|
1265
|
+
}
|
|
1266
|
+
}
|
|
1267
|
+
|
|
1268
|
+
/* If marked as tfoot, verify it actually comes after === in HTML */
|
|
1269
|
+
if (is_marked_tfoot && min_equals_row_idx >= 0) {
|
|
1270
|
+
/* Calculate how many HTML rows appear before the === row.
|
|
1271
|
+
* Count non-removed AST rows up to min_equals_row_idx.
|
|
1272
|
+
* This gives us the HTML row index of the last row that appears BEFORE the === row. */
|
|
1273
|
+
int html_rows_before_equals = -1; /* Start at -1, will be 0 for header */
|
|
1274
|
+
for (int r = 0; r < 100 && r <= min_equals_row_idx; r++) {
|
|
1275
|
+
bool has_non_removed = false;
|
|
1276
|
+
for (all_cell *c = all_cells; c; c = c->next) {
|
|
1277
|
+
if (c->table_index == table_idx &&
|
|
1278
|
+
c->row_index == r &&
|
|
1279
|
+
!c->is_removed) {
|
|
1280
|
+
has_non_removed = true;
|
|
1281
|
+
break;
|
|
1282
|
+
}
|
|
1283
|
+
}
|
|
1284
|
+
if (has_non_removed) {
|
|
1285
|
+
html_rows_before_equals++;
|
|
1286
|
+
}
|
|
1287
|
+
}
|
|
1288
|
+
|
|
1289
|
+
/* CRITICAL: The issue is that rows with AST index > min_equals_row_idx can still
|
|
1290
|
+
* appear in HTML before the === row (since === is skipped). So we need to check
|
|
1291
|
+
* if this row's HTML position (row_idx) is <= html_rows_before_equals.
|
|
1292
|
+
*
|
|
1293
|
+
* The key insight: html_rows_before_equals is the HTML row index of the last row
|
|
1294
|
+
* that appears BEFORE the === row. So if row_idx <= html_rows_before_equals,
|
|
1295
|
+
* this row appears before === in HTML, so it must be in tbody.
|
|
1296
|
+
*
|
|
1297
|
+
* But we also need to check AST position as a fallback, because the HTML position
|
|
1298
|
+
* calculation might be off if the row mapping is wrong. */
|
|
1299
|
+
bool force_to_tbody = false;
|
|
1300
|
+
|
|
1301
|
+
/* First check: AST position */
|
|
1302
|
+
if (ast_row_idx <= min_equals_row_idx) {
|
|
1303
|
+
/* AST says it's before or at ===, so it must be in tbody */
|
|
1304
|
+
force_to_tbody = true;
|
|
1305
|
+
}
|
|
1306
|
+
|
|
1307
|
+
/* Second check: HTML position
|
|
1308
|
+
* Since the === row is skipped in HTML, rows with row_idx <= html_rows_before_equals + 2
|
|
1309
|
+
* appear before the === row in HTML. The +2 accounts for:
|
|
1310
|
+
* - html_rows_before_equals is the count of rows before === (including header)
|
|
1311
|
+
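* - row_idx is assumed 1-based here (1=header, 2=first data, 3=second data, 4====, 5=footer); NOTE(review): the code resets row_idx to -1 per table and increments it on each <tr>, so the header is actually row_idx 0 - confirm this offset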
|
|
1312
|
+
* - So row_idx <= html_rows_before_equals + 2 covers the first two data rows. */
|
|
1313
|
+
if (html_rows_before_equals >= 0 && row_idx <= html_rows_before_equals + 2) {
|
|
1314
|
+
/* HTML position says it's before ===, so it must be in tbody */
|
|
1315
|
+
force_to_tbody = true;
|
|
1316
|
+
}
|
|
1317
|
+
|
|
1318
|
+
/* Third check: If row_idx <= 3, it's definitely in tbody (header + first two data rows) */
|
|
1319
|
+
if (row_idx <= 3) {
|
|
1320
|
+
force_to_tbody = true;
|
|
1321
|
+
}
|
|
1322
|
+
|
|
1323
|
+
if (force_to_tbody) {
|
|
1324
|
+
current_row_is_tfoot = false;
|
|
1325
|
+
} else {
|
|
1326
|
+
/* AST and HTML both say it's after ===, so mark as tfoot */
|
|
1327
|
+
current_row_is_tfoot = true;
|
|
1328
|
+
}
|
|
1329
|
+
} else if (is_marked_tfoot) {
|
|
1330
|
+
/* No === row found, but row is marked as tfoot - use AST marking */
|
|
1331
|
+
/* BUT: Even if no === row is found, if this row's HTML position suggests
|
|
1332
|
+
* it should be in tbody (e.g., it's one of the first few rows), don't mark as tfoot.
|
|
1333
|
+
* This handles edge cases where the === row might not be detected correctly. */
|
|
1334
|
+
if (row_idx <= 2) {
|
|
1335
|
+
/* If this is one of the first few rows, it's probably in tbody */
|
|
1336
|
+
current_row_is_tfoot = false;
|
|
1337
|
+
} else {
|
|
1338
|
+
current_row_is_tfoot = true;
|
|
1339
|
+
}
|
|
1340
|
+
}
|
|
1341
|
+
|
|
1342
|
+
/* If we're in tbody and this is a tfoot row, close tbody and open tfoot */
|
|
1343
|
+
if (current_row_is_tfoot && in_tbody && !in_tfoot) {
|
|
1344
|
+
/* Close tbody */
|
|
1345
|
+
const char *tbody_close = "</tbody>\n";
|
|
1346
|
+
size_t tbody_close_len = strlen(tbody_close);
|
|
1347
|
+
while (written + tbody_close_len > capacity) {
|
|
1348
|
+
capacity *= 2;
|
|
1349
|
+
char *new_output = realloc(output, capacity);
|
|
1350
|
+
if (!new_output) break;
|
|
1351
|
+
output = new_output;
|
|
1352
|
+
write = output + written;
|
|
1353
|
+
}
|
|
1354
|
+
memcpy(write, tbody_close, tbody_close_len);
|
|
1355
|
+
write += tbody_close_len;
|
|
1356
|
+
written += tbody_close_len;
|
|
1357
|
+
in_tbody = false;
|
|
1358
|
+
|
|
1359
|
+
/* Open tfoot */
|
|
1360
|
+
const char *tfoot_open = "<tfoot>\n";
|
|
1361
|
+
size_t tfoot_open_len = strlen(tfoot_open);
|
|
1362
|
+
while (written + tfoot_open_len > capacity) {
|
|
1363
|
+
capacity *= 2;
|
|
1364
|
+
char *new_output = realloc(output, capacity);
|
|
1365
|
+
if (!new_output) break;
|
|
1366
|
+
output = new_output;
|
|
1367
|
+
write = output + written;
|
|
1368
|
+
}
|
|
1369
|
+
memcpy(write, tfoot_open, tfoot_open_len);
|
|
1370
|
+
write += tfoot_open_len;
|
|
1371
|
+
written += tfoot_open_len;
|
|
1372
|
+
in_tfoot = true;
|
|
1373
|
+
}
|
|
1374
|
+
/* Note: Once we're in tfoot, we stay in tfoot - don't reopen tbody */
|
|
1375
|
+
/* tfoot rows should be at the end of the table */
|
|
1376
|
+
|
|
1377
|
+
/* Check if this row should be completely removed (all cells marked for removal) */
|
|
1378
|
+
/* For tfoot rows that are pure === markers (all cells marked), skip the entire row */
|
|
1379
|
+
/* For tfoot rows with actual content, render them normally */
|
|
1380
|
+
bool should_skip_row = false;
|
|
1381
|
+
|
|
1382
|
+
/* CRITICAL SAFEGUARD: If this is one of the first few rows (row_idx <= 3), it's
|
|
1383
|
+
* almost certainly in tbody, not tfoot. NEVER skip it.
|
|
1384
|
+
* Note: row_idx is incremented before this check (line 894), so:
|
|
1385
|
+
* - row_idx 1 = header
|
|
1386
|
+
* - row_idx 2 = first data row
|
|
1387
|
+
* - row_idx 3 = second data row (this is the one we were missing!)
|
|
1388
|
+
* - row_idx 4 = === row (should be skipped)
|
|
1389
|
+
* - row_idx 5 = footer
|
|
1390
|
+
*
|
|
1391
|
+
* This check must run FIRST, before any other skip logic, to ensure these rows
|
|
1392
|
+
* are always rendered. */
|
|
1393
|
+
bool force_keep_row = false;
|
|
1394
|
+
if (row_idx <= 3) {
|
|
1395
|
+
/* First few rows (header + first two data rows) are always in tbody - never skip them */
|
|
1396
|
+
force_keep_row = true;
|
|
1397
|
+
should_skip_row = false;
|
|
1398
|
+
/* Always set current_row_is_tfoot = false for these rows - they're definitely in tbody */
|
|
1399
|
+
current_row_is_tfoot = false;
|
|
1400
|
+
}
|
|
1401
|
+
|
|
1402
|
+
/* CRITICAL: Only check if row should be skipped if we haven't already determined
|
|
1403
|
+
* it should be in tbody. If current_row_is_tfoot is false, this row was either
|
|
1404
|
+
* never marked as tfoot OR was forced to tbody because it appears before === in HTML.
|
|
1405
|
+
* In either case, it should NOT be skipped, even if all cells are marked for removal.
|
|
1406
|
+
*
|
|
1407
|
+
* IMPORTANT: We must check this AFTER setting current_row_is_tfoot, so we know
|
|
1408
|
+
* if the row was forced to tbody. */
|
|
1409
|
+
if (!force_keep_row && current_row_is_tfoot) {
|
|
1410
|
+
/* Count ALL cells in this row (using all_cells) to see if any are non-removed */
|
|
1411
|
+
int total_cells_in_row = 0;
|
|
1412
|
+
int removed_cells_in_row = 0;
|
|
1413
|
+
for (all_cell *c = all_cells; c; c = c->next) {
|
|
1414
|
+
if (c->table_index == table_idx && c->row_index == ast_row_idx) {
|
|
1415
|
+
total_cells_in_row++;
|
|
1416
|
+
if (c->is_removed) {
|
|
1417
|
+
removed_cells_in_row++;
|
|
1418
|
+
}
|
|
1419
|
+
}
|
|
1420
|
+
}
|
|
1421
|
+
/* If all cells in this row are marked for removal, skip the entire row */
|
|
1422
|
+
/* This applies to tfoot rows that are pure === markers */
|
|
1423
|
+
if (total_cells_in_row > 0 && total_cells_in_row == removed_cells_in_row) {
|
|
1424
|
+
should_skip_row = true;
|
|
1425
|
+
}
|
|
1426
|
+
}
|
|
1427
|
+
/* If current_row_is_tfoot is false, this is a data row that should be rendered,
|
|
1428
|
+
* regardless of whether cells are marked for removal in the AST */
|
|
1429
|
+
/* Also check if this row contains === markers by checking the AST row directly.
|
|
1430
|
+
* This handles cases where the row mapping might have issues.
|
|
1431
|
+
* BUT: Only check for === markers if this row is actually in tfoot.
|
|
1432
|
+
* If current_row_is_tfoot is false (forced to tbody), don't skip it even if it has === markers.
|
|
1433
|
+
*
|
|
1434
|
+
* CRITICAL: Don't run this check if force_keep_row is true. */
|
|
1435
|
+
if (!force_keep_row && !should_skip_row && current_row_is_tfoot) {
|
|
1436
|
+
/* Check if any cells in current AST row contain === */
|
|
1437
|
+
bool has_equals = false;
|
|
1438
|
+
for (cell_attr *a = attrs; a; a = a->next) {
|
|
1439
|
+
if (a->table_index == table_idx && a->row_index == ast_row_idx &&
|
|
1440
|
+
strstr(a->attributes, "data-remove")) {
|
|
1441
|
+
if (a->cell_text) {
|
|
1442
|
+
const char *text = a->cell_text;
|
|
1443
|
+
while (*text && isspace((unsigned char)*text)) text++;
|
|
1444
|
+
if (text[0] == '=' && text[1] == '=' && text[2] == '=') {
|
|
1445
|
+
has_equals = true;
|
|
1446
|
+
break;
|
|
1447
|
+
}
|
|
1448
|
+
}
|
|
1449
|
+
}
|
|
1450
|
+
}
|
|
1451
|
+
/* If this row contains === markers, skip it */
|
|
1452
|
+
/* BUT: Don't skip if force_keep_row is true (first few rows are protected) */
|
|
1453
|
+
if (has_equals && !force_keep_row) {
|
|
1454
|
+
should_skip_row = true;
|
|
1455
|
+
} else {
|
|
1456
|
+
/* Also check previous AST row in case row mapping is off */
|
|
1457
|
+
/* BUT: Only do this check if we're actually in tfoot.
|
|
1458
|
+
* If current_row_is_tfoot is false, this row was forced to tbody,
|
|
1459
|
+
* so don't skip it even if the previous row has === markers. */
|
|
1460
|
+
if (ast_row_idx > 0 && current_row_is_tfoot) {
|
|
1461
|
+
int prev_ast_row = ast_row_idx - 1;
|
|
1462
|
+
for (cell_attr *a = attrs; a; a = a->next) {
|
|
1463
|
+
if (a->table_index == table_idx && a->row_index == prev_ast_row &&
|
|
1464
|
+
strstr(a->attributes, "data-remove") && a->cell_text) {
|
|
1465
|
+
const char *text = a->cell_text;
|
|
1466
|
+
while (*text && isspace((unsigned char)*text)) text++;
|
|
1467
|
+
if (text[0] == '=' && text[1] == '=' && text[2] == '=') {
|
|
1468
|
+
has_equals = true;
|
|
1469
|
+
/* If previous row has === and this is first tfoot row, this HTML row is the === row */
|
|
1470
|
+
/* BUT: Don't skip if this row was forced to tbody (row_idx <= 3) or force_keep_row is true */
|
|
1471
|
+
if (row_idx <= 4 && !force_keep_row && !(min_equals_row_idx >= 0 && row_idx <= 3)) {
|
|
1472
|
+
should_skip_row = true;
|
|
1473
|
+
}
|
|
1474
|
+
break;
|
|
1475
|
+
}
|
|
1476
|
+
}
|
|
1477
|
+
}
|
|
1478
|
+
}
|
|
1479
|
+
}
|
|
1480
|
+
}
|
|
1481
|
+
/* FINAL CHECK: If this row was forced to tbody (because it appears before === in HTML),
|
|
1482
|
+
* NEVER skip it, regardless of any other conditions. This check happens after all
|
|
1483
|
+
* skip logic has been evaluated, to ensure rows forced to tbody are always rendered.
|
|
1484
|
+
*
|
|
1485
|
+
* IMPORTANT: This check must recalculate min_equals_row_idx to ensure it's available,
|
|
1486
|
+
* in case it wasn't calculated earlier or was calculated incorrectly. */
|
|
1487
|
+
bool was_forced_to_tbody = false;
|
|
1488
|
+
|
|
1489
|
+
/* First, find the === row's AST index (recalculate if needed) */
|
|
1490
|
+
int final_min_equals_row_idx = min_equals_row_idx;
|
|
1491
|
+
if (final_min_equals_row_idx < 0) {
|
|
1492
|
+
for (int r = 0; r < 100; r++) {
|
|
1493
|
+
int eq_total = 0;
|
|
1494
|
+
int eq_removed = 0;
|
|
1495
|
+
for (all_cell *c = all_cells; c; c = c->next) {
|
|
1496
|
+
if (c->table_index == table_idx && c->row_index == r) {
|
|
1497
|
+
eq_total++;
|
|
1498
|
+
if (c->is_removed) eq_removed++;
|
|
1499
|
+
}
|
|
1500
|
+
}
|
|
1501
|
+
if (eq_total > 0 && eq_total == eq_removed) {
|
|
1502
|
+
final_min_equals_row_idx = r;
|
|
1503
|
+
break;
|
|
1504
|
+
}
|
|
1505
|
+
}
|
|
1506
|
+
}
|
|
1507
|
+
|
|
1508
|
+
if (final_min_equals_row_idx >= 0) {
|
|
1509
|
+
int html_rows_before_equals = -1;
|
|
1510
|
+
for (int r = 0; r < 100 && r <= final_min_equals_row_idx; r++) {
|
|
1511
|
+
bool has_non_removed = false;
|
|
1512
|
+
for (all_cell *c = all_cells; c; c = c->next) {
|
|
1513
|
+
if (c->table_index == table_idx &&
|
|
1514
|
+
c->row_index == r &&
|
|
1515
|
+
!c->is_removed) {
|
|
1516
|
+
has_non_removed = true;
|
|
1517
|
+
break;
|
|
1518
|
+
}
|
|
1519
|
+
}
|
|
1520
|
+
if (has_non_removed) {
|
|
1521
|
+
html_rows_before_equals++;
|
|
1522
|
+
}
|
|
1523
|
+
}
|
|
1524
|
+
/* Check HTML position: if row_idx is <= html_rows_before_equals + 1,
|
|
1525
|
+
* it appears before the === row in HTML, so it must be in tbody */
|
|
1526
|
+
if (html_rows_before_equals >= 0 && row_idx <= html_rows_before_equals + 1) {
|
|
1527
|
+
was_forced_to_tbody = true;
|
|
1528
|
+
}
|
|
1529
|
+
/* Also check AST position as a fallback: if ast_row_idx <= final_min_equals_row_idx,
|
|
1530
|
+
* it's before or at the === row in AST, so it must be in tbody */
|
|
1531
|
+
if (!was_forced_to_tbody && ast_row_idx >= 0 && ast_row_idx <= final_min_equals_row_idx) {
|
|
1532
|
+
was_forced_to_tbody = true;
|
|
1533
|
+
}
|
|
1534
|
+
}
|
|
1535
|
+
|
|
1536
|
+
/* CRITICAL: If this row was forced to tbody, NEVER skip it and ensure it's not in tfoot.
|
|
1537
|
+
* This check runs AFTER all skip logic, to ensure rows forced to tbody are always rendered.
|
|
1538
|
+
*
|
|
1539
|
+
* ADDITIONAL SAFEGUARD: If there's a === row and this is one of the first 3 HTML rows
|
|
1540
|
+
* (row_idx <= 3), it's almost certainly in tbody, not tfoot. This handles edge cases
|
|
1541
|
+
* where the calculation might be off.
|
|
1542
|
+
*
|
|
1543
|
+
* FINAL OVERRIDE: This is the last check before skipping, so it must override any
|
|
1544
|
+
* previous skip decisions. Also check force_keep_row flag. */
|
|
1545
|
+
if (force_keep_row || was_forced_to_tbody || (final_min_equals_row_idx >= 0 && row_idx <= 3) ||
|
|
1546
|
+
(min_equals_row_idx >= 0 && row_idx <= 3)) {
|
|
1547
|
+
should_skip_row = false;
|
|
1548
|
+
current_row_is_tfoot = false;
|
|
1549
|
+
}
|
|
1550
|
+
|
|
1551
|
+
/* FINAL CHECK: If force_keep_row is true, NEVER skip this row, regardless of should_skip_row.
|
|
1552
|
+
* Also, if row_idx <= 3, protect it as an extra safeguard. */
|
|
1553
|
+
if (force_keep_row || row_idx <= 3) {
|
|
1554
|
+
should_skip_row = false;
|
|
1555
|
+
/* If this is one of the first few rows and there's a === row, it's definitely in tbody */
|
|
1556
|
+
if (row_idx <= 3 && min_equals_row_idx >= 0) {
|
|
1557
|
+
current_row_is_tfoot = false;
|
|
1558
|
+
}
|
|
1559
|
+
}
|
|
1560
|
+
|
|
1561
|
+
if (should_skip_row) {
|
|
1562
|
+
/* Skip the opening <tr> tag and everything until </tr> */
|
|
1563
|
+
read += 4;
|
|
1564
|
+
const char *tr_end = strstr(read, "</tr>");
|
|
1565
|
+
if (tr_end) {
|
|
1566
|
+
read = tr_end + 5;
|
|
1567
|
+
} else {
|
|
1568
|
+
/* Fallback: skip to next tag */
|
|
1569
|
+
while (*read && strncmp(read, "</tr>", 5) != 0) read++;
|
|
1570
|
+
if (*read) read += 5;
|
|
1571
|
+
}
|
|
1572
|
+
continue; /* Don't set in_row, skip to next iteration (won't copy <tr>) */
|
|
1573
|
+
}
|
|
1574
|
+
/* Otherwise, this is a normal row (or tfoot row) - copy <tr> and process cells */
|
|
1575
|
+
/* Set in_row BEFORE processing cells so detection code can run */
|
|
1576
|
+
in_row = true;
|
|
1577
|
+
/* Copy the <tr> tag */
|
|
1578
|
+
while (*read && *read != '>') {
|
|
1579
|
+
*write++ = *read++;
|
|
1580
|
+
written++;
|
|
1581
|
+
}
|
|
1582
|
+
if (*read == '>') {
|
|
1583
|
+
*write++ = *read++;
|
|
1584
|
+
written++;
|
|
1585
|
+
}
|
|
1586
|
+
} else if (in_row && strncmp(read, "</tr>", 5) == 0) {
|
|
1587
|
+
in_row = false;
|
|
1588
|
+
} else if (strncmp(read, "<p>", 3) == 0) {
|
|
1589
|
+
/* Check if this paragraph should be removed */
|
|
1590
|
+
para_idx++;
|
|
1591
|
+
para_to_remove *para_remove = NULL;
|
|
1592
|
+
for (para_to_remove *p = paras_to_remove; p; p = p->next) {
|
|
1593
|
+
if (p->para_index == para_idx) {
|
|
1594
|
+
para_remove = p;
|
|
1595
|
+
break;
|
|
1596
|
+
}
|
|
1597
|
+
}
|
|
1598
|
+
|
|
1599
|
+
/* Extract paragraph text to check */
|
|
1600
|
+
const char *para_start = read + 3;
|
|
1601
|
+
const char *para_end = strstr(para_start, "</p>");
|
|
1602
|
+
if (para_end) {
|
|
1603
|
+
/* Check if paragraph starts with [ (caption format) or : (colon caption format) */
|
|
1604
|
+
const char *text_start = para_start;
|
|
1605
|
+
/* Skip any leading whitespace */
|
|
1606
|
+
while (*text_start && text_start < para_end && isspace((unsigned char)*text_start)) text_start++;
|
|
1607
|
+
|
|
1608
|
+
bool is_caption_para = false;
|
|
1609
|
+
if (*text_start == '[' ||
|
|
1610
|
+
(text_start < para_end - 4 && strncmp(text_start, "<", 4) == 0)) {
|
|
1611
|
+
/* This looks like a bracket caption paragraph */
|
|
1612
|
+
is_caption_para = true;
|
|
1613
|
+
} else if (*text_start == ':' && (text_start + 1) < para_end &&
|
|
1614
|
+
(text_start[1] == ' ' || text_start[1] == '\t')) {
|
|
1615
|
+
/* This looks like a : Caption format paragraph */
|
|
1616
|
+
is_caption_para = true;
|
|
1617
|
+
}
|
|
1618
|
+
|
|
1619
|
+
if (is_caption_para) {
|
|
1620
|
+
/* This is a caption paragraph */
|
|
1621
|
+
|
|
1622
|
+
/* First check: if we have a fingerprint match from AST */
|
|
1623
|
+
if (para_remove && para_remove->text_fingerprint) {
|
|
1624
|
+
const char *fingerprint_text = para_remove->text_fingerprint;
|
|
1625
|
+
if (fingerprint_text && strlen(fingerprint_text) > 0) {
|
|
1626
|
+
/* Simple substring match - check if fingerprint appears anywhere in paragraph content */
|
|
1627
|
+
if (strstr(para_start, fingerprint_text) != NULL) {
|
|
1628
|
+
/* Skip this entire paragraph */
|
|
1629
|
+
read = para_end + 4; /* Skip past </p> */
|
|
1630
|
+
continue; /* Skip normal copy */
|
|
1631
|
+
}
|
|
1632
|
+
}
|
|
1633
|
+
}
|
|
1634
|
+
|
|
1635
|
+
/* Second check: if we're near a table/figure that has a caption, remove this paragraph */
|
|
1636
|
+
/* This is a fallback for cases where fingerprint matching fails */
|
|
1637
|
+
/* Check if any table has a caption, and if this paragraph's caption text matches */
|
|
1638
|
+
const char *caption_start = NULL;
|
|
1639
|
+
const char *caption_end = NULL;
|
|
1640
|
+
|
|
1641
|
+
if (*text_start == '[') {
|
|
1642
|
+
/* Bracket format: [Caption] */
|
|
1643
|
+
caption_start = text_start + 1;
|
|
1644
|
+
caption_end = strchr(caption_start, ']');
|
|
1645
|
+
if (!caption_end || caption_end >= para_end) {
|
|
1646
|
+
/* Try HTML entity version */
|
|
1647
|
+
const char *gt_entity = strstr(caption_start, ">");
|
|
1648
|
+
if (gt_entity && gt_entity < para_end) caption_end = gt_entity;
|
|
1649
|
+
}
|
|
1650
|
+
} else if (text_start < para_end - 4 && strncmp(text_start, "<", 4) == 0) {
|
|
1651
|
+
/* HTML entity bracket format: <Caption> */
|
|
1652
|
+
caption_start = text_start + 4;
|
|
1653
|
+
caption_end = strstr(caption_start, ">");
|
|
1654
|
+
} else if (*text_start == ':' && (text_start + 1) < para_end &&
|
|
1655
|
+
(text_start[1] == ' ' || text_start[1] == '\t')) {
|
|
1656
|
+
/* Colon format: : Caption */
|
|
1657
|
+
caption_start = text_start + 2; /* Skip : and space */
|
|
1658
|
+
/* Find IAL at the end (if any) - look for { */
|
|
1659
|
+
caption_end = para_end;
|
|
1660
|
+
const char *ial_search = para_end - 1;
|
|
1661
|
+
while (ial_search >= caption_start) {
|
|
1662
|
+
if (*ial_search == '}') {
|
|
1663
|
+
/* Found closing brace, look backwards for opening brace */
|
|
1664
|
+
const char *open = ial_search;
|
|
1665
|
+
while (open >= caption_start && *open != '{') {
|
|
1666
|
+
open--;
|
|
1667
|
+
}
|
|
1668
|
+
if (open >= caption_start && *open == '{') {
|
|
1669
|
+
/* Check if it's a valid IAL pattern */
|
|
1670
|
+
if ((open[1] == ':' || open[1] == '#' || open[1] == '.') &&
|
|
1671
|
+
ial_search > open) {
|
|
1672
|
+
caption_end = open; /* Caption ends before IAL */
|
|
1673
|
+
break;
|
|
1674
|
+
}
|
|
1675
|
+
}
|
|
1676
|
+
}
|
|
1677
|
+
ial_search--;
|
|
1678
|
+
}
|
|
1679
|
+
/* Trim whitespace from caption */
|
|
1680
|
+
while (caption_start < caption_end && isspace((unsigned char)*caption_start)) {
|
|
1681
|
+
caption_start++;
|
|
1682
|
+
}
|
|
1683
|
+
while (caption_end > caption_start && isspace((unsigned char)*(caption_end - 1))) {
|
|
1684
|
+
caption_end--;
|
|
1685
|
+
}
|
|
1686
|
+
}
|
|
1687
|
+
|
|
1688
|
+
if (caption_start && caption_end && caption_end > caption_start && caption_end < para_end) {
|
|
1689
|
+
size_t caption_len = caption_end - caption_start;
|
|
1690
|
+
if (caption_len > 0 && caption_len < 512) {
|
|
1691
|
+
char para_caption[512];
|
|
1692
|
+
memcpy(para_caption, caption_start, caption_len);
|
|
1693
|
+
para_caption[caption_len] = '\0';
|
|
1694
|
+
/* Compare with all table captions - if any match, remove this paragraph */
|
|
1695
|
+
for (table_caption *cap = captions; cap; cap = cap->next) {
|
|
1696
|
+
if (cap->caption && strcmp(para_caption, cap->caption) == 0) {
|
|
1697
|
+
/* Match! Remove this paragraph */
|
|
1698
|
+
read = para_end + 4;
|
|
1699
|
+
continue;
|
|
1700
|
+
}
|
|
1701
|
+
}
|
|
1702
|
+
}
|
|
1703
|
+
}
|
|
1704
|
+
}
|
|
1705
|
+
}
|
|
1706
|
+
}
|
|
1707
|
+
|
|
1708
|
+
/* Check for cell opening tags - process both header and body cells
|
|
1709
|
+
* IMPORTANT: Check for <thead> and <tbody> first to avoid matching them as <th> or <td> cells */
|
|
1710
|
+
if ((in_row || in_thead) &&
|
|
1711
|
+
(strncmp(read, "<td", 3) == 0 || strncmp(read, "<th", 3) == 0) &&
|
|
1712
|
+
strncmp(read, "<thead>", 7) != 0 && strncmp(read, "<tbody>", 7) != 0 &&
|
|
1713
|
+
strncmp(read, "<tfoot>", 7) != 0) {
|
|
1714
|
+
bool is_th = strncmp(read, "<th", 3) == 0 && strncmp(read, "<thead>", 7) != 0;
|
|
1715
|
+
/* Extract cell content for debugging and matching - only if we need it */
|
|
1716
|
+
char cell_preview[100] = {0};
|
|
1717
|
+
|
|
1718
|
+
/* Extract cell content if we need it for:
|
|
1719
|
+
* - Header row detection (first header cell only)
|
|
1720
|
+
* - Attribute matching (only if we need content verification)
|
|
1721
|
+
* Note: Alignment processing extracts content separately and only when needed */
|
|
1722
|
+
bool need_cell_content_for_header = (in_table && !in_tbody && !in_tfoot && in_thead &&
|
|
1723
|
+
table_idx >= 0 && table_idx < 50 &&
|
|
1724
|
+
row_idx == 0 && col_idx == 0 && is_th);
|
|
1725
|
+
bool need_cell_content = need_cell_content_for_header;
|
|
1726
|
+
|
|
1727
|
+
/* For attribute matching, we'll extract content only if we find a potential match
|
|
1728
|
+
* that requires content verification (e.g., when there are multiple candidates) */
|
|
1729
|
+
if (need_cell_content) {
|
|
1730
|
+
const char *close_tag = is_th ? "</th>" : "</td>";
|
|
1731
|
+
const char *content_start = strchr(read, '>');
|
|
1732
|
+
if (content_start) {
|
|
1733
|
+
const char *content_end = strstr(content_start + 1, close_tag);
|
|
1734
|
+
if (content_end) {
|
|
1735
|
+
/* Extract just the content between > and </td>/</th> */
|
|
1736
|
+
/* For empty cells like <th></th>, content_end is right after content_start */
|
|
1737
|
+
size_t len = content_end - content_start - 1;
|
|
1738
|
+
if (len >= 0 && len < 99) {
|
|
1739
|
+
strncpy(cell_preview, content_start + 1, len);
|
|
1740
|
+
cell_preview[len] = '\0';
|
|
1741
|
+
/* Trim trailing whitespace and newlines */
|
|
1742
|
+
while (len > 0 && (cell_preview[len-1] == '\n' || cell_preview[len-1] == '\r' || isspace((unsigned char)cell_preview[len-1]))) {
|
|
1743
|
+
cell_preview[--len] = '\0';
|
|
1744
|
+
}
|
|
1745
|
+
} else if (len == 0) {
|
|
1746
|
+
/* Empty cell like <th></th> - set cell_preview to empty string */
|
|
1747
|
+
cell_preview[0] = '\0';
|
|
1748
|
+
}
|
|
1749
|
+
}
|
|
1750
|
+
}
|
|
1751
|
+
}
|
|
1752
|
+
|
|
1753
|
+
/* Detect header row with empty first cell to enable row-header column.
|
|
1754
|
+
* We only consider the first header row's first cell (<thead>, first row, col_idx == 0).
|
|
1755
|
+
* row_idx starts at -1 and is incremented to 0 for the first <tr> (header row).
|
|
1756
|
+
* After the first <tr> increment, row_idx is 0 for the header row.
|
|
1757
|
+
* Note: Comments suggest row_idx is 1-based, but code shows it's 0-based (starts at -1, becomes 0). */
|
|
1758
|
+
if (in_table && !in_tbody && !in_tfoot && in_thead &&
|
|
1759
|
+
table_idx >= 0 && table_idx < 50 &&
|
|
1760
|
+
row_idx == 0 && col_idx == 0 && is_th) {
|
|
1761
|
+
/* Check if this cell is empty - extract directly from HTML for reliability.
|
|
1762
|
+
* read points to the start of the <th tag, so we need to find the > immediately after it. */
|
|
1763
|
+
bool header_first_cell_empty = false;
|
|
1764
|
+
/* Find the > that closes this <th tag - it should be right after <th or <th with attributes */
|
|
1765
|
+
const char *tag_end = strchr(read, '>');
|
|
1766
|
+
if (tag_end && tag_end > read && tag_end < read + 20) {
|
|
1767
|
+
/* Found the > for this <th tag - now find the matching </th> immediately following */
|
|
1768
|
+
const char *content_start = tag_end + 1;
|
|
1769
|
+
const char *content_end = strstr(content_start, "</th>");
|
|
1770
|
+
if (content_end && content_end < read + 100) {
|
|
1771
|
+
/* Check if content between > and </th> is empty or whitespace only */
|
|
1772
|
+
/* For empty cells like <th></th>, content_end should be right after content_start */
|
|
1773
|
+
header_first_cell_empty = true;
|
|
1774
|
+
for (const char *p = content_start; p < content_end; p++) {
|
|
1775
|
+
if (!isspace((unsigned char)*p)) {
|
|
1776
|
+
header_first_cell_empty = false;
|
|
1777
|
+
break;
|
|
1778
|
+
}
|
|
1779
|
+
}
|
|
1780
|
+
} else {
|
|
1781
|
+
/* No closing tag found in reasonable distance - treat as empty */
|
|
1782
|
+
header_first_cell_empty = true;
|
|
1783
|
+
}
|
|
1784
|
+
} else {
|
|
1785
|
+
/* No > found in reasonable distance - treat as empty (shouldn't happen) */
|
|
1786
|
+
header_first_cell_empty = true;
|
|
1787
|
+
}
|
|
1788
|
+
if (header_first_cell_empty) {
|
|
1789
|
+
table_has_row_header_first_col[table_idx] = true;
|
|
1790
|
+
}
|
|
1791
|
+
}
|
|
1792
|
+
|
|
1793
|
+
/* Use pre-calculated mapping: HTML position -> original column index */
|
|
1794
|
+
int target_original_col = -1;
|
|
1795
|
+
if (col_idx < row_col_mapping_size) {
|
|
1796
|
+
target_original_col = row_col_mapping[col_idx];
|
|
1797
|
+
}
|
|
1798
|
+
|
|
1799
|
+
/* Find matching attribute using the mapped original column index and AST row index.
|
|
1800
|
+
* Also verify that the cell content matches to avoid matching cells that are covered
|
|
1801
|
+
* by rowspans from previous rows.
|
|
1802
|
+
*
|
|
1803
|
+
* If no match found in current row, also check the previous AST row in case of row
|
|
1804
|
+
* detection issues (e.g., when a cell with rowspan is processed in the wrong HTML row).
|
|
1805
|
+
*
|
|
1806
|
+
* For tfoot rows with === markers, also check the previous AST row since the row mapping
|
|
1807
|
+
* might skip the === row if all its cells are marked for removal. */
|
|
1808
|
+
cell_attr *matching = NULL;
|
|
1809
|
+
|
|
1810
|
+
if (target_original_col >= 0 && attrs != NULL) {
|
|
1811
|
+
/* First, try to match in the current AST row by position (fast) */
|
|
1812
|
+
for (cell_attr *a = attrs; a; a = a->next) {
|
|
1813
|
+
if (a->table_index == table_idx &&
|
|
1814
|
+
a->row_index == ast_row_idx &&
|
|
1815
|
+
a->col_index == target_original_col) {
|
|
1816
|
+
/* Found a position match - only verify content if we have cell_text to compare */
|
|
1817
|
+
bool content_matches = true;
|
|
1818
|
+
if (a->cell_text && a->cell_text[0] != '\0') {
|
|
1819
|
+
/* Need to extract cell content for verification (lazy extraction) */
|
|
1820
|
+
if (cell_preview[0] == '\0') {
|
|
1821
|
+
bool is_th = strncmp(read, "<th", 3) == 0;
|
|
1822
|
+
const char *close_tag = is_th ? "</th>" : "</td>";
|
|
1823
|
+
const char *content_start = strchr(read, '>');
|
|
1824
|
+
if (content_start) {
|
|
1825
|
+
const char *content_end = strstr(content_start + 1, close_tag);
|
|
1826
|
+
if (content_end && content_end - content_start - 1 < 99) {
|
|
1827
|
+
size_t len = content_end - content_start - 1;
|
|
1828
|
+
strncpy(cell_preview, content_start + 1, len);
|
|
1829
|
+
cell_preview[len] = '\0';
|
|
1830
|
+
while (len > 0 && (cell_preview[len-1] == '\n' || cell_preview[len-1] == '\r' || isspace((unsigned char)cell_preview[len-1]))) {
|
|
1831
|
+
cell_preview[--len] = '\0';
|
|
1832
|
+
}
|
|
1833
|
+
}
|
|
1834
|
+
}
|
|
1835
|
+
}
|
|
1836
|
+
if (cell_preview[0] != '\0') {
|
|
1837
|
+
/* Compare cell content - trim whitespace for comparison */
|
|
1838
|
+
const char *attr_text = a->cell_text;
|
|
1839
|
+
const char *html_text = cell_preview;
|
|
1840
|
+
/* Skip leading whitespace */
|
|
1841
|
+
while (*attr_text && isspace((unsigned char)*attr_text)) attr_text++;
|
|
1842
|
+
while (*html_text && isspace((unsigned char)*html_text)) html_text++;
|
|
1843
|
+
/* Compare (case-sensitive, but we can make it more lenient if needed) */
|
|
1844
|
+
content_matches = (strncmp(attr_text, html_text, strlen(attr_text)) == 0 &&
|
|
1845
|
+
(html_text[strlen(attr_text)] == '\0' || isspace((unsigned char)html_text[strlen(attr_text)])));
|
|
1846
|
+
}
|
|
1847
|
+
}
|
|
1848
|
+
if (content_matches) {
|
|
1849
|
+
matching = a;
|
|
1850
|
+
break;
|
|
1851
|
+
}
|
|
1852
|
+
}
|
|
1853
|
+
}
|
|
1854
|
+
|
|
1855
|
+
/* If no match found in current row, also check the previous AST row.
|
|
1856
|
+
* This is especially important for tfoot rows with === markers, where the row mapping
|
|
1857
|
+
* might skip the === row if all its cells are marked for removal. */
|
|
1858
|
+
if (!matching && ast_row_idx > 0) {
|
|
1859
|
+
for (cell_attr *a = attrs; a; a = a->next) {
|
|
1860
|
+
if (a->table_index == table_idx &&
|
|
1861
|
+
a->row_index == ast_row_idx - 1 &&
|
|
1862
|
+
a->col_index == target_original_col &&
|
|
1863
|
+
strstr(a->attributes, "data-remove")) {
|
|
1864
|
+
/* Check if content matches (for === cells) - extract if needed */
|
|
1865
|
+
bool content_matches = true;
|
|
1866
|
+
if (a->cell_text && a->cell_text[0] != '\0') {
|
|
1867
|
+
if (cell_preview[0] == '\0') {
|
|
1868
|
+
bool is_th = strncmp(read, "<th", 3) == 0;
|
|
1869
|
+
const char *close_tag = is_th ? "</th>" : "</td>";
|
|
1870
|
+
const char *content_start = strchr(read, '>');
|
|
1871
|
+
if (content_start) {
|
|
1872
|
+
const char *content_end = strstr(content_start + 1, close_tag);
|
|
1873
|
+
if (content_end && content_end - content_start - 1 < 99) {
|
|
1874
|
+
size_t len = content_end - content_start - 1;
|
|
1875
|
+
strncpy(cell_preview, content_start + 1, len);
|
|
1876
|
+
cell_preview[len] = '\0';
|
|
1877
|
+
while (len > 0 && (cell_preview[len-1] == '\n' || cell_preview[len-1] == '\r' || isspace((unsigned char)cell_preview[len-1]))) {
|
|
1878
|
+
cell_preview[--len] = '\0';
|
|
1879
|
+
}
|
|
1880
|
+
}
|
|
1881
|
+
}
|
|
1882
|
+
}
|
|
1883
|
+
if (cell_preview[0] != '\0') {
|
|
1884
|
+
const char *attr_text = a->cell_text;
|
|
1885
|
+
const char *html_text = cell_preview;
|
|
1886
|
+
while (*attr_text && isspace((unsigned char)*attr_text)) attr_text++;
|
|
1887
|
+
while (*html_text && isspace((unsigned char)*html_text)) html_text++;
|
|
1888
|
+
content_matches = (strncmp(attr_text, html_text, strlen(attr_text)) == 0 &&
|
|
1889
|
+
(html_text[strlen(attr_text)] == '\0' || isspace((unsigned char)html_text[strlen(attr_text)])));
|
|
1890
|
+
}
|
|
1891
|
+
}
|
|
1892
|
+
if (content_matches) {
|
|
1893
|
+
matching = a;
|
|
1894
|
+
break;
|
|
1895
|
+
}
|
|
1896
|
+
}
|
|
1897
|
+
}
|
|
1898
|
+
}
|
|
1899
|
+
|
|
1900
|
+
/* Don't use fallback matching to previous row - it causes incorrect attribute application.
|
|
1901
|
+
* If no match is found in the current row, that's correct - the cell doesn't have any attributes. */
|
|
1902
|
+
}
|
|
1903
|
+
|
|
1904
|
+
/* Content-based fallback matching for header/footer rows (or when column-based matching fails).
|
|
1905
|
+
* This is important because column mapping can be wrong for rows with colspans.
|
|
1906
|
+
* Match cells by content and prioritize cells with colspan/rowspan attributes.
|
|
1907
|
+
* Skip this expensive operation for very large tables to avoid timeout. */
|
|
1908
|
+
/* Extract cell content if we need it for content-based or rowspan matching */
|
|
1909
|
+
if (attrs != NULL && !matching && cell_preview[0] == '\0' && ast_row_idx >= 0) {
|
|
1910
|
+
const char *close_tag = is_th ? "</th>" : "</td>";
|
|
1911
|
+
const char *content_start = strchr(read, '>');
|
|
1912
|
+
if (content_start) {
|
|
1913
|
+
const char *content_end = strstr(content_start + 1, close_tag);
|
|
1914
|
+
if (content_end && content_end - content_start - 1 < 99) {
|
|
1915
|
+
size_t len = content_end - content_start - 1;
|
|
1916
|
+
strncpy(cell_preview, content_start + 1, len);
|
|
1917
|
+
cell_preview[len] = '\0';
|
|
1918
|
+
while (len > 0 && (cell_preview[len-1] == '\n' || cell_preview[len-1] == '\r' || isspace((unsigned char)cell_preview[len-1]))) {
|
|
1919
|
+
cell_preview[--len] = '\0';
|
|
1920
|
+
}
|
|
1921
|
+
}
|
|
1922
|
+
}
|
|
1923
|
+
}
|
|
1924
|
+
if (attrs != NULL && !matching && ast_row_idx >= 0) {
|
|
1925
|
+
/* Extract cell content if we haven't already */
|
|
1926
|
+
if (cell_preview[0] == '\0') {
|
|
1927
|
+
bool is_th_tag = strncmp(read, "<th", 3) == 0;
|
|
1928
|
+
const char *close_tag_str = is_th_tag ? "</th>" : "</td>";
|
|
1929
|
+
const char *content_start = strchr(read, '>');
|
|
1930
|
+
if (content_start) {
|
|
1931
|
+
const char *content_end = strstr(content_start + 1, close_tag_str);
|
|
1932
|
+
if (content_end && content_end - content_start - 1 < 99) {
|
|
1933
|
+
size_t len = content_end - content_start - 1;
|
|
1934
|
+
strncpy(cell_preview, content_start + 1, len);
|
|
1935
|
+
cell_preview[len] = '\0';
|
|
1936
|
+
while (len > 0 && (cell_preview[len-1] == '\n' || cell_preview[len-1] == '\r' || isspace((unsigned char)cell_preview[len-1]))) {
|
|
1937
|
+
cell_preview[--len] = '\0';
|
|
1938
|
+
}
|
|
1939
|
+
}
|
|
1940
|
+
}
|
|
1941
|
+
}
|
|
1942
|
+
|
|
1943
|
+
if (cell_preview[0] != '\0') {
|
|
1944
|
+
/* For very large tables, skip content-based matching to avoid timeout */
|
|
1945
|
+
/* Position-based matching should be sufficient for most cases */
|
|
1946
|
+
int attr_count = 0;
|
|
1947
|
+
for (cell_attr *check = attrs; check && attr_count < 1000; check = check->next) attr_count++;
|
|
1948
|
+
if (attr_count <= 500) {
|
|
1949
|
+
/* Only do expensive content-based matching if we don't have too many attributes */
|
|
1950
|
+
cell_attr *content_match = NULL;
|
|
1951
|
+
cell_attr *span_match = NULL;
|
|
1952
|
+
|
|
1953
|
+
/* Try to find a cell in the same AST row by matching content */
|
|
1954
|
+
for (cell_attr *a = attrs; a; a = a->next) {
|
|
1955
|
+
if (a->table_index == table_idx &&
|
|
1956
|
+
a->row_index == ast_row_idx &&
|
|
1957
|
+
a->cell_text) {
|
|
1958
|
+
const char *attr_text = a->cell_text;
|
|
1959
|
+
const char *html_text = cell_preview;
|
|
1960
|
+
/* Skip leading whitespace */
|
|
1961
|
+
while (*attr_text && isspace((unsigned char)*attr_text)) attr_text++;
|
|
1962
|
+
while (*html_text && isspace((unsigned char)*html_text)) html_text++;
|
|
1963
|
+
/* Compare - use lenient comparison */
|
|
1964
|
+
size_t attr_len = strlen(attr_text);
|
|
1965
|
+
size_t html_len = strlen(html_text);
|
|
1966
|
+
/* Skip trailing whitespace for comparison */
|
|
1967
|
+
while (attr_len > 0 && isspace((unsigned char)attr_text[attr_len - 1])) attr_len--;
|
|
1968
|
+
while (html_len > 0 && isspace((unsigned char)html_text[html_len - 1])) html_len--;
|
|
1969
|
+
if (attr_len > 0 && html_len > 0 &&
|
|
1970
|
+
attr_len == html_len &&
|
|
1971
|
+
strncmp(attr_text, html_text, attr_len) == 0) {
|
|
1972
|
+
/* Found a match by content */
|
|
1973
|
+
if (!content_match) {
|
|
1974
|
+
content_match = a; /* Remember first content match */
|
|
1975
|
+
}
|
|
1976
|
+
/* Prefer matches with colspan/rowspan attributes */
|
|
1977
|
+
if (strstr(a->attributes, "colspan") || strstr(a->attributes, "rowspan")) {
|
|
1978
|
+
span_match = a;
|
|
1979
|
+
break; /* Found the best match, stop searching */
|
|
1980
|
+
}
|
|
1981
|
+
}
|
|
1982
|
+
}
|
|
1983
|
+
}
|
|
1984
|
+
|
|
1985
|
+
/* Use span_match if available, otherwise use content_match */
|
|
1986
|
+
if (span_match) {
|
|
1987
|
+
matching = span_match;
|
|
1988
|
+
} else if (content_match) {
|
|
1989
|
+
matching = content_match;
|
|
1990
|
+
}
|
|
1991
|
+
}
|
|
1992
|
+
}
|
|
1993
|
+
}
|
|
1994
|
+
|
|
1995
|
+
/* Additional fallback: If we still don't have a match, try to find any cell in the same
|
|
1996
|
+
* table/row with matching content that has colspan. This is important when cells with <<
|
|
1997
|
+
* are removed and column indices shift. Extract cell content if we haven't already. */
|
|
1998
|
+
if (!matching && ast_row_idx >= 0 && attrs != NULL) {
|
|
1999
|
+
/* Extract cell content if we haven't already */
|
|
2000
|
+
if (cell_preview[0] == '\0') {
|
|
2001
|
+
bool is_th_tag = strncmp(read, "<th", 3) == 0;
|
|
2002
|
+
const char *close_tag_str = is_th_tag ? "</th>" : "</td>";
|
|
2003
|
+
const char *content_start = strchr(read, '>');
|
|
2004
|
+
if (content_start) {
|
|
2005
|
+
const char *content_end = strstr(content_start + 1, close_tag_str);
|
|
2006
|
+
if (content_end && content_end - content_start - 1 < 99) {
|
|
2007
|
+
size_t len = content_end - content_start - 1;
|
|
2008
|
+
strncpy(cell_preview, content_start + 1, len);
|
|
2009
|
+
cell_preview[len] = '\0';
|
|
2010
|
+
while (len > 0 && (cell_preview[len-1] == '\n' || cell_preview[len-1] == '\r' || isspace((unsigned char)cell_preview[len-1]))) {
|
|
2011
|
+
cell_preview[--len] = '\0';
|
|
2012
|
+
}
|
|
2013
|
+
}
|
|
2014
|
+
}
|
|
2015
|
+
}
|
|
2016
|
+
|
|
2017
|
+
if (cell_preview[0] != '\0') {
|
|
2018
|
+
/* Try matching within the same AST row first */
|
|
2019
|
+
for (cell_attr *a = attrs; a; a = a->next) {
|
|
2020
|
+
if (a->table_index == table_idx &&
|
|
2021
|
+
a->row_index == ast_row_idx &&
|
|
2022
|
+
strstr(a->attributes, "colspan") &&
|
|
2023
|
+
a->cell_text) {
|
|
2024
|
+
const char *attr_text = a->cell_text;
|
|
2025
|
+
const char *html_text = cell_preview;
|
|
2026
|
+
/* Trim whitespace */
|
|
2027
|
+
while (*attr_text && isspace((unsigned char)*attr_text)) attr_text++;
|
|
2028
|
+
while (*html_text && isspace((unsigned char)*html_text)) html_text++;
|
|
2029
|
+
size_t attr_len = strlen(attr_text);
|
|
2030
|
+
size_t html_len = strlen(html_text);
|
|
2031
|
+
while (attr_len > 0 && isspace((unsigned char)attr_text[attr_len - 1])) attr_len--;
|
|
2032
|
+
while (html_len > 0 && isspace((unsigned char)html_text[html_len - 1])) html_len--;
|
|
2033
|
+
if (attr_len > 0 && html_len > 0 &&
|
|
2034
|
+
attr_len == html_len &&
|
|
2035
|
+
strncmp(attr_text, html_text, attr_len) == 0) {
|
|
2036
|
+
fprintf(stderr, "DEBUG MATCH: Found colspan cell by content fallback (same row) - text=[%s] attrs=[%s]\n",
|
|
2037
|
+
html_text, a->attributes);
|
|
2038
|
+
matching = a;
|
|
2039
|
+
break;
|
|
2040
|
+
}
|
|
2041
|
+
}
|
|
2042
|
+
}
|
|
2043
|
+
|
|
2044
|
+
/* If no match found, try nearby rows (row index might be off by 1 due to removed rows) */
|
|
2045
|
+
if (!matching) {
|
|
2046
|
+
for (cell_attr *a = attrs; a; a = a->next) {
|
|
2047
|
+
if (a->table_index == table_idx &&
|
|
2048
|
+
(a->row_index == ast_row_idx || a->row_index == ast_row_idx - 1 || a->row_index == ast_row_idx + 1) &&
|
|
2049
|
+
strstr(a->attributes, "colspan") &&
|
|
2050
|
+
a->cell_text) {
|
|
2051
|
+
const char *attr_text = a->cell_text;
|
|
2052
|
+
const char *html_text = cell_preview;
|
|
2053
|
+
/* Trim whitespace */
|
|
2054
|
+
while (*attr_text && isspace((unsigned char)*attr_text)) attr_text++;
|
|
2055
|
+
while (*html_text && isspace((unsigned char)*html_text)) html_text++;
|
|
2056
|
+
size_t attr_len = strlen(attr_text);
|
|
2057
|
+
size_t html_len = strlen(html_text);
|
|
2058
|
+
while (attr_len > 0 && isspace((unsigned char)attr_text[attr_len - 1])) attr_len--;
|
|
2059
|
+
while (html_len > 0 && isspace((unsigned char)html_text[html_len - 1])) html_len--;
|
|
2060
|
+
if (attr_len > 0 && html_len > 0 &&
|
|
2061
|
+
attr_len == html_len &&
|
|
2062
|
+
strncmp(attr_text, html_text, attr_len) == 0) {
|
|
2063
|
+
matching = a;
|
|
2064
|
+
break;
|
|
2065
|
+
}
|
|
2066
|
+
}
|
|
2067
|
+
}
|
|
2068
|
+
}
|
|
2069
|
+
}
|
|
2070
|
+
}
|
|
2071
|
+
|
|
2072
|
+
/* Final safety fallback for rowspan cells:
|
|
2073
|
+
* If we still don't have a match, but this HTML cell's text matches exactly one AST cell
|
|
2074
|
+
* in the same table that has a rowspan attribute, use that. This ensures that rowspans
|
|
2075
|
+
* computed in the AST (e.g., for the last ^^ in a block) always get injected, even if
|
|
2076
|
+
* row/column mapping was slightly off.
|
|
2077
|
+
*
|
|
2078
|
+
* To avoid mis-applying attributes, we:
|
|
2079
|
+
* - Require an exact trimmed-text match
|
|
2080
|
+
* - Restrict to cells in the same table
|
|
2081
|
+
* - Restrict to cells that already have a "rowspan" attribute
|
|
2082
|
+
* - Require that the match be unique (only one candidate) */
|
|
2083
|
+
/* Extract cell content if we need it for rowspan matching */
|
|
2084
|
+
if (attrs != NULL && !matching && cell_preview[0] == '\0' && ast_row_idx >= 0) {
|
|
2085
|
+
const char *close_tag = is_th ? "</th>" : "</td>";
|
|
2086
|
+
const char *content_start = strchr(read, '>');
|
|
2087
|
+
if (content_start) {
|
|
2088
|
+
const char *content_end = strstr(content_start + 1, close_tag);
|
|
2089
|
+
if (content_end && content_end - content_start - 1 < 99) {
|
|
2090
|
+
size_t len = content_end - content_start - 1;
|
|
2091
|
+
strncpy(cell_preview, content_start + 1, len);
|
|
2092
|
+
cell_preview[len] = '\0';
|
|
2093
|
+
while (len > 0 && (cell_preview[len-1] == '\n' || cell_preview[len-1] == '\r' || isspace((unsigned char)cell_preview[len-1]))) {
|
|
2094
|
+
cell_preview[--len] = '\0';
|
|
2095
|
+
}
|
|
2096
|
+
}
|
|
2097
|
+
}
|
|
2098
|
+
}
|
|
2099
|
+
if (attrs != NULL && !matching && cell_preview[0] != '\0' && ast_row_idx >= 0) {
|
|
2100
|
+
cell_attr *rowspan_candidate = NULL;
|
|
2101
|
+
bool multiple_candidates = false;
|
|
2102
|
+
|
|
2103
|
+
for (cell_attr *a = attrs; a; a = a->next) {
|
|
2104
|
+
if (a->table_index != table_idx) continue;
|
|
2105
|
+
/* Only consider cells in the same AST row, or at most one row above.
|
|
2106
|
+
* This allows us to recover from small row-mapping off-by-one errors
|
|
2107
|
+
* (e.g., when a header/body boundary shifts indices by 1), while
|
|
2108
|
+
* still preventing rowspans from leaking far down into unrelated
|
|
2109
|
+
* rows (like a later "Active" block). */
|
|
2110
|
+
int row_diff = ast_row_idx - a->row_index;
|
|
2111
|
+
if (row_diff < 0 || row_diff > 1) continue;
|
|
2112
|
+
if (!a->cell_text) continue;
|
|
2113
|
+
if (!strstr(a->attributes, "rowspan")) continue;
|
|
2114
|
+
|
|
2115
|
+
const char *attr_text = a->cell_text;
|
|
2116
|
+
const char *html_text = cell_preview;
|
|
2117
|
+
|
|
2118
|
+
/* Trim leading whitespace */
|
|
2119
|
+
while (*attr_text && isspace((unsigned char)*attr_text)) attr_text++;
|
|
2120
|
+
while (*html_text && isspace((unsigned char)*html_text)) html_text++;
|
|
2121
|
+
|
|
2122
|
+
/* Compute trimmed lengths */
|
|
2123
|
+
size_t attr_len = strlen(attr_text);
|
|
2124
|
+
size_t html_len = strlen(html_text);
|
|
2125
|
+
while (attr_len > 0 && isspace((unsigned char)attr_text[attr_len - 1])) attr_len--;
|
|
2126
|
+
while (html_len > 0 && isspace((unsigned char)html_text[html_len - 1])) html_len--;
|
|
2127
|
+
|
|
2128
|
+
if (attr_len == 0 || html_len == 0) continue;
|
|
2129
|
+
if (attr_len != html_len) continue;
|
|
2130
|
+
if (strncmp(attr_text, html_text, attr_len) != 0) continue;
|
|
2131
|
+
|
|
2132
|
+
/* We have an exact trimmed-text match for a rowspan cell in this table. */
|
|
2133
|
+
if (!rowspan_candidate) {
|
|
2134
|
+
rowspan_candidate = a;
|
|
2135
|
+
} else {
|
|
2136
|
+
/* More than one candidate with same text - ambiguous, bail out. */
|
|
2137
|
+
multiple_candidates = true;
|
|
2138
|
+
break;
|
|
2139
|
+
}
|
|
2140
|
+
}
|
|
2141
|
+
|
|
2142
|
+
if (rowspan_candidate && !multiple_candidates) {
|
|
2143
|
+
matching = rowspan_candidate;
|
|
2144
|
+
}
|
|
2145
|
+
}
|
|
2146
|
+
|
|
2147
|
+
/* Also check if this cell contains "^^" (rowspan marker) - these should be removed */
|
|
2148
|
+
/* Check both the preview and the actual content in the HTML */
|
|
2149
|
+
bool is_rowspan_marker = (strstr(cell_preview, "^^") != NULL);
|
|
2150
|
+
if (!is_rowspan_marker) {
|
|
2151
|
+
/* Also check the actual HTML content for "^^" */
|
|
2152
|
+
const char *content_check = strstr(read, ">");
|
|
2153
|
+
if (content_check) {
|
|
2154
|
+
const char *close_check = strstr(content_check + 1, "</td>");
|
|
2155
|
+
if (!close_check) close_check = strstr(content_check + 1, "</th>");
|
|
2156
|
+
if (close_check && close_check - content_check - 1 < 100) {
|
|
2157
|
+
char check_buf[100];
|
|
2158
|
+
strncpy(check_buf, content_check + 1, close_check - content_check - 1);
|
|
2159
|
+
check_buf[close_check - content_check - 1] = '\0';
|
|
2160
|
+
is_rowspan_marker = (strstr(check_buf, "^^") != NULL);
|
|
2161
|
+
}
|
|
2162
|
+
}
|
|
2163
|
+
}
|
|
2164
|
+
|
|
2165
|
+
/* Also check if this cell contains "<< " or "<<" (colspan marker) - these should be removed */
|
|
2166
|
+
/* Check both the preview and the actual content in the HTML */
|
|
2167
|
+
bool is_colspan_marker = (strstr(cell_preview, "<<") != NULL || strstr(cell_preview, "<<") != NULL);
|
|
2168
|
+
if (!is_colspan_marker) {
|
|
2169
|
+
/* Also check the actual HTML content for "<< " or "<<" */
|
|
2170
|
+
const char *content_check = strstr(read, ">");
|
|
2171
|
+
if (content_check) {
|
|
2172
|
+
const char *close_check = strstr(content_check + 1, "</td>");
|
|
2173
|
+
if (!close_check) close_check = strstr(content_check + 1, "</th>");
|
|
2174
|
+
if (close_check && close_check - content_check - 1 < 100) {
|
|
2175
|
+
char check_buf[100];
|
|
2176
|
+
strncpy(check_buf, content_check + 1, close_check - content_check - 1);
|
|
2177
|
+
check_buf[close_check - content_check - 1] = '\0';
|
|
2178
|
+
is_colspan_marker = (strstr(check_buf, "<<") != NULL || strstr(check_buf, "<<") != NULL);
|
|
2179
|
+
}
|
|
2180
|
+
}
|
|
2181
|
+
}
|
|
2182
|
+
|
|
2183
|
+
/* Check if this cell should be removed:
|
|
2184
|
+
* 1. If it's matched and marked for removal
|
|
2185
|
+
* 2. If it contains "^^" (rowspan marker)
|
|
2186
|
+
* 3. If it contains "<< " or "<<" (colspan marker)
|
|
2187
|
+
* 4. If it's empty and the previous cell in the same row has colspan (empty cells after colspan should be removed) */
|
|
2188
|
+
bool should_remove_cell = false;
|
|
2189
|
+
if (matching && strstr(matching->attributes, "data-remove")) {
|
|
2190
|
+
should_remove_cell = true;
|
|
2191
|
+
} else if (is_rowspan_marker) {
|
|
2192
|
+
should_remove_cell = true;
|
|
2193
|
+
} else if (is_colspan_marker) {
|
|
2194
|
+
should_remove_cell = true;
|
|
2195
|
+
}
|
|
2196
|
+
|
|
2197
|
+
/* Check if this empty cell should be removed.
|
|
2198
|
+
* Only remove empty cells that are explicitly marked for removal in the AST (part of a colspan).
|
|
2199
|
+
* We need to check both:
|
|
2200
|
+
* 1. Cells in the mapping (target_original_col >= 0) - check if marked for removal
|
|
2201
|
+
* 2. Cells not in the mapping (target_original_col < 0) - these might be part of colspan,
|
|
2202
|
+
* but we should only remove if the previous cell in the same row has colspan.
|
|
2203
|
+
*
|
|
2204
|
+
* IMPORTANT: Be conservative - only remove empty cells if we're certain they're part of a colspan.
|
|
2205
|
+
* Don't remove legitimate empty cells. */
|
|
2206
|
+
if (!should_remove_cell && cell_preview[0] == '\0' && ast_row_idx >= 0) {
|
|
2207
|
+
if (target_original_col >= 0) {
|
|
2208
|
+
/* Cell is in the mapping - check if explicitly marked for removal in AST */
|
|
2209
|
+
for (cell_attr *a = attrs; a; a = a->next) {
|
|
2210
|
+
if (a->table_index == table_idx &&
|
|
2211
|
+
a->row_index == ast_row_idx &&
|
|
2212
|
+
a->col_index == target_original_col &&
|
|
2213
|
+
strstr(a->attributes, "data-remove")) {
|
|
2214
|
+
should_remove_cell = true;
|
|
2215
|
+
break;
|
|
2216
|
+
}
|
|
2217
|
+
}
|
|
2218
|
+
} else if (target_original_col < 0 &&
|
|
2219
|
+
prev_cell_matching &&
|
|
2220
|
+
prev_cell_matching->row_index == ast_row_idx &&
|
|
2221
|
+
strstr(prev_cell_matching->attributes, "colspan")) {
|
|
2222
|
+
/* Cell not in mapping - check if previous cell in same row has colspan > 1.
|
|
2223
|
+
* This is a strong indicator that the empty cell is part of the colspan. */
|
|
2224
|
+
int colspan_val = 1;
|
|
2225
|
+
if (strstr(prev_cell_matching->attributes, "colspan=")) {
|
|
2226
|
+
sscanf(strstr(prev_cell_matching->attributes, "colspan="), "colspan=\"%d\"", &colspan_val);
|
|
2227
|
+
}
|
|
2228
|
+
if (colspan_val > 1) {
|
|
2229
|
+
should_remove_cell = true;
|
|
2230
|
+
}
|
|
2231
|
+
}
|
|
2232
|
+
}
|
|
2233
|
+
|
|
2234
|
+
if (should_remove_cell) {
|
|
2235
|
+
/* Skip this entire cell (including for tfoot rows - === rows are skipped entirely) */
|
|
2236
|
+
/* Note: Removed cells are not rendered by the HTML renderer, so we shouldn't see them here.
|
|
2237
|
+
* But if we do (e.g., from cmark-gfm before our processing), skip them.
|
|
2238
|
+
* Also remove cells containing "^^" (rowspan markers) even if they're not matched.
|
|
2239
|
+
* We still increment col_idx to match the column index used when collecting attributes
|
|
2240
|
+
* (which counts all cells including removed ones). */
|
|
2241
|
+
bool is_th = strncmp(read, "<th", 3) == 0;
|
|
2242
|
+
const char *close_tag = is_th ? "</th>" : "</td>";
|
|
2243
|
+
|
|
2244
|
+
/* Skip opening tag */
|
|
2245
|
+
while (*read && *read != '>') read++;
|
|
2246
|
+
if (*read == '>') read++;
|
|
2247
|
+
|
|
2248
|
+
/* Skip content until closing tag */
|
|
2249
|
+
while (*read && strncmp(read, close_tag, 5) != 0) read++;
|
|
2250
|
+
if (strncmp(read, close_tag, 5) == 0) read += 5;
|
|
2251
|
+
|
|
2252
|
+
col_idx++; /* Increment to match column index from collection (counts all cells) */
|
|
2253
|
+
/* Don't reset prev_cell_matching for removed cells - keep it so we can remove
|
|
2254
|
+
* subsequent empty cells that are part of the same colspan range */
|
|
2255
|
+
continue;
|
|
2256
|
+
} else if (matching && (strstr(matching->attributes, "rowspan") || strstr(matching->attributes, "colspan"))) {
|
|
2257
|
+
/* Copy only the tag name (<td or <th). Skip everything until '>' - do not copy, so we never
|
|
2258
|
+
* include erroneously placed cell content (e.g. <td A > or <tdA>). Then inject our attributes
|
|
2259
|
+
* and write '>'. Existing attributes like align are dropped for this cell to avoid copying
|
|
2260
|
+
* malformed content; alignment can be re-applied by postprocess if needed. */
|
|
2261
|
+
memcpy(write, read, 3);
|
|
2262
|
+
write += 3;
|
|
2263
|
+
read += 3;
|
|
2264
|
+
while (*read && *read != '>') read++;
|
|
2265
|
+
/* Always add a space between tag name and first attribute */
|
|
2266
|
+
*write++ = ' ';
|
|
2267
|
+
/* Inject our attributes (skip leading space in attrs if present) */
|
|
2268
|
+
const char *attr_str = matching->attributes;
|
|
2269
|
+
while (*attr_str == ' ' || *attr_str == '\t') attr_str++;
|
|
2270
|
+
while (*attr_str) {
|
|
2271
|
+
*write++ = *attr_str++;
|
|
2272
|
+
}
|
|
2273
|
+
/* Copy or write the '>' so cell content follows correctly */
|
|
2274
|
+
if (*read == '>') {
|
|
2275
|
+
*write++ = *read++;
|
|
2276
|
+
} else {
|
|
2277
|
+
*write++ = '>';
|
|
2278
|
+
}
|
|
2279
|
+
col_idx++;
|
|
2280
|
+
prev_cell_matching = matching; /* Track this cell for next cell's colspan check */
|
|
2281
|
+
continue;
|
|
2282
|
+
}
|
|
2283
|
+
|
|
2284
|
+
/* Convert first-column body cells to row headers when the header
|
|
2285
|
+
* row's first cell was empty (| | Header ...).
|
|
2286
|
+
* We emit <th scope="row"> for those cells before any alignment processing. */
|
|
2287
|
+
bool make_row_header = false;
|
|
2288
|
+
if (!is_th &&
|
|
2289
|
+
in_tbody &&
|
|
2290
|
+
in_row &&
|
|
2291
|
+
table_idx >= 0 && table_idx < 50 &&
|
|
2292
|
+
table_has_row_header_first_col[table_idx] &&
|
|
2293
|
+
col_idx == 0) {
|
|
2294
|
+
make_row_header = true;
|
|
2295
|
+
}
|
|
2296
|
+
|
|
2297
|
+
/* Also detect empty first header cell here as a fallback, in case the earlier detection didn't run.
|
|
2298
|
+
* This runs for header cells (is_th) when processing the first cell (col_idx == 0) in the first row (row_idx == 0).
|
|
2299
|
+
* We need to check BEFORE col_idx is incremented, so this check happens early in the cell processing. */
|
|
2300
|
+
if (!make_row_header &&
|
|
2301
|
+
in_table && !in_tbody && !in_tfoot && in_thead &&
|
|
2302
|
+
table_idx >= 0 && table_idx < 50 &&
|
|
2303
|
+
row_idx == 0 && col_idx == 0 && is_th) {
|
|
2304
|
+
/* Check if this cell is empty - extract directly from HTML for reliability */
|
|
2305
|
+
bool is_empty = false;
|
|
2306
|
+
const char *close_tag = "</th>";
|
|
2307
|
+
const char *tag_end = strchr(read, '>');
|
|
2308
|
+
if (tag_end && tag_end > read && tag_end < read + 20) {
|
|
2309
|
+
const char *content_start = tag_end + 1;
|
|
2310
|
+
const char *content_end = strstr(content_start, close_tag);
|
|
2311
|
+
if (content_end && content_end < read + 100) {
|
|
2312
|
+
/* Check if content between > and </th> is empty or whitespace only */
|
|
2313
|
+
is_empty = true;
|
|
2314
|
+
for (const char *p = content_start; p < content_end; p++) {
|
|
2315
|
+
if (!isspace((unsigned char)*p)) {
|
|
2316
|
+
is_empty = false;
|
|
2317
|
+
break;
|
|
2318
|
+
}
|
|
2319
|
+
}
|
|
2320
|
+
} else {
|
|
2321
|
+
/* No closing tag found in reasonable distance - treat as empty */
|
|
2322
|
+
is_empty = true;
|
|
2323
|
+
}
|
|
2324
|
+
} else {
|
|
2325
|
+
/* No > found in reasonable distance - treat as empty (shouldn't happen) */
|
|
2326
|
+
is_empty = true;
|
|
2327
|
+
}
|
|
2328
|
+
if (is_empty && !table_has_row_header_first_col[table_idx]) {
|
|
2329
|
+
table_has_row_header_first_col[table_idx] = true;
|
|
2330
|
+
}
|
|
2331
|
+
}
|
|
2332
|
+
|
|
2333
|
+
if (make_row_header) {
|
|
2334
|
+
const char *tag_end = strchr(read, '>');
|
|
2335
|
+
if (tag_end) {
|
|
2336
|
+
const char *cell_content_start = tag_end + 1;
|
|
2337
|
+
const char *close_tag_td = strstr(cell_content_start, "</td>");
|
|
2338
|
+
const char *close_tag_th = strstr(cell_content_start, "</th>");
|
|
2339
|
+
const char *cell_content_end = NULL;
|
|
2340
|
+
const char *orig_close = NULL;
|
|
2341
|
+
|
|
2342
|
+
if (close_tag_td && (!close_tag_th || close_tag_td < close_tag_th)) {
|
|
2343
|
+
cell_content_end = close_tag_td;
|
|
2344
|
+
orig_close = close_tag_td;
|
|
2345
|
+
} else if (close_tag_th) {
|
|
2346
|
+
cell_content_end = close_tag_th;
|
|
2347
|
+
orig_close = close_tag_th;
|
|
2348
|
+
}
|
|
2349
|
+
|
|
2350
|
+
if (cell_content_end) {
|
|
2351
|
+
/* Write <th scope="row"> */
|
|
2352
|
+
const char *th_open = "<th scope=\"row\">";
|
|
2353
|
+
size_t th_open_len = strlen(th_open);
|
|
2354
|
+
memcpy(write, th_open, th_open_len);
|
|
2355
|
+
write += th_open_len;
|
|
2356
|
+
written += th_open_len;
|
|
2357
|
+
|
|
2358
|
+
/* Copy original cell content */
|
|
2359
|
+
const char *p = cell_content_start;
|
|
2360
|
+
while (p < cell_content_end) {
|
|
2361
|
+
*write++ = *p++;
|
|
2362
|
+
written++;
|
|
2363
|
+
}
|
|
2364
|
+
|
|
2365
|
+
/* Write closing </th> */
|
|
2366
|
+
const char *th_close = "</th>";
|
|
2367
|
+
size_t th_close_len = strlen(th_close);
|
|
2368
|
+
memcpy(write, th_close, th_close_len);
|
|
2369
|
+
write += th_close_len;
|
|
2370
|
+
written += th_close_len;
|
|
2371
|
+
|
|
2372
|
+
/* Advance read pointer past original closing tag */
|
|
2373
|
+
read = orig_close;
|
|
2374
|
+
if (strncmp(read, "</td>", 5) == 0 || strncmp(read, "</th>", 5) == 0) {
|
|
2375
|
+
read += 5;
|
|
2376
|
+
}
|
|
2377
|
+
|
|
2378
|
+
col_idx++;
|
|
2379
|
+
prev_cell_matching = matching;
|
|
2380
|
+
continue;
|
|
2381
|
+
}
|
|
2382
|
+
}
|
|
2383
|
+
}
|
|
2384
|
+
|
|
2385
|
+
/* Process cell alignment (check for leading/trailing colons) for cells without spans or row-header conversion */
|
|
2386
|
+
/* Only process if alignment colons were detected in the early exit check */
|
|
2387
|
+
if (should_process_alignment) {
|
|
2388
|
+
/* Fast inline check: look for '>' in opening tag (avoid strchr if possible) */
|
|
2389
|
+
const char *tag_end = read;
|
|
2390
|
+
int tag_len = 0;
|
|
2391
|
+
while (tag_len < 100 && *tag_end && *tag_end != '>') {
|
|
2392
|
+
tag_end++;
|
|
2393
|
+
tag_len++;
|
|
2394
|
+
}
|
|
2395
|
+
if (*tag_end != '>') {
|
|
2396
|
+
/* Tag too long or malformed, skip alignment processing */
|
|
2397
|
+
col_idx++;
|
|
2398
|
+
prev_cell_matching = matching;
|
|
2399
|
+
/* Continue with normal character-by-character processing */
|
|
2400
|
+
} else {
|
|
2401
|
+
/* Check if cell already has align attribute (from column alignment) */
|
|
2402
|
+
/* We still need to check for per-cell alignment colons to override column alignment */
|
|
2403
|
+
bool has_align_attr = false;
|
|
2404
|
+
const char *align_attr_start = NULL;
|
|
2405
|
+
const char *align_attr_end = NULL;
|
|
2406
|
+
const char *tag_check = read;
|
|
2407
|
+
|
|
2408
|
+
/* Scan up to tag_end for "align=" */
|
|
2409
|
+
for (int i = 0; i < tag_len && tag_check < tag_end; i++, tag_check++) {
|
|
2410
|
+
if (strncmp(tag_check, "align=", 6) == 0) {
|
|
2411
|
+
has_align_attr = true;
|
|
2412
|
+
align_attr_start = tag_check;
|
|
2413
|
+
/* Find the end of the align attribute value */
|
|
2414
|
+
const char *quote = strchr(tag_check + 6, '"');
|
|
2415
|
+
if (quote) {
|
|
2416
|
+
align_attr_end = strchr(quote + 1, '"');
|
|
2417
|
+
}
|
|
2418
|
+
break;
|
|
2419
|
+
}
|
|
2420
|
+
}
|
|
2421
|
+
|
|
2422
|
+
/* Fast check: look for colon in cell content before extracting full content */
|
|
2423
|
+
/* This avoids expensive strchr/strstr for cells that clearly don't have alignment */
|
|
2424
|
+
bool is_th = strncmp(read, "<th", 3) == 0;
|
|
2425
|
+
const char *close_tag = is_th ? "</th>" : "</td>";
|
|
2426
|
+
/* Use inline search for close tag (faster than strstr for short distances) */
|
|
2427
|
+
const char *close_tag_pos = tag_end + 1;
|
|
2428
|
+
bool found_close = false;
|
|
2429
|
+
/* Limit search to first 500 chars to avoid scanning huge cells */
|
|
2430
|
+
for (int i = 0; i < 500 && *close_tag_pos; i++) {
|
|
2431
|
+
if (strncmp(close_tag_pos, close_tag, 5) == 0) {
|
|
2432
|
+
found_close = true;
|
|
2433
|
+
break;
|
|
2434
|
+
}
|
|
2435
|
+
close_tag_pos++;
|
|
2436
|
+
}
|
|
2437
|
+
|
|
2438
|
+
if (!found_close) {
|
|
2439
|
+
/* Close tag not found nearby, skip alignment processing */
|
|
2440
|
+
col_idx++;
|
|
2441
|
+
prev_cell_matching = matching;
|
|
2442
|
+
/* Continue with normal character-by-character processing */
|
|
2443
|
+
} else if (close_tag_pos && close_tag_pos > tag_end + 1) {
|
|
2444
|
+
/* Quick check for colon - after whitespace, alignment colons are only at start or end */
|
|
2445
|
+
/* This is much faster than scanning 50+ characters */
|
|
2446
|
+
bool has_colon = false;
|
|
2447
|
+
const char *content_start = tag_end + 1;
|
|
2448
|
+
const char *content_end = close_tag_pos;
|
|
2449
|
+
|
|
2450
|
+
/* Skip leading whitespace and check first non-whitespace character */
|
|
2451
|
+
const char *first_char = content_start;
|
|
2452
|
+
while (first_char < content_end && isspace((unsigned char)*first_char)) {
|
|
2453
|
+
first_char++;
|
|
2454
|
+
}
|
|
2455
|
+
if (first_char < content_end && *first_char == ':') {
|
|
2456
|
+
has_colon = true;
|
|
2457
|
+
}
|
|
2458
|
+
|
|
2459
|
+
/* If not found, skip trailing whitespace and check last non-whitespace character */
|
|
2460
|
+
if (!has_colon) {
|
|
2461
|
+
const char *last_char = content_end - 1;
|
|
2462
|
+
while (last_char > content_start && isspace((unsigned char)*last_char)) {
|
|
2463
|
+
last_char--;
|
|
2464
|
+
}
|
|
2465
|
+
if (last_char >= content_start && *last_char == ':') {
|
|
2466
|
+
has_colon = true;
|
|
2467
|
+
}
|
|
2468
|
+
}
|
|
2469
|
+
|
|
2470
|
+
if (!has_colon) {
|
|
2471
|
+
/* No colon found, skip alignment processing for this cell */
|
|
2472
|
+
col_idx++;
|
|
2473
|
+
prev_cell_matching = matching;
|
|
2474
|
+
/* Continue with normal character-by-character processing */
|
|
2475
|
+
} else {
|
|
2476
|
+
/* Colon found, extract full content and process alignment */
|
|
2477
|
+
const char *cell_content_start = tag_end + 1;
|
|
2478
|
+
const char *cell_content_end = close_tag_pos;
|
|
2479
|
+
|
|
2480
|
+
/* Quick check: if content is too long, skip alignment processing to avoid timeout */
|
|
2481
|
+
size_t content_len = cell_content_end - cell_content_start;
|
|
2482
|
+
if (content_len > 10000) {
|
|
2483
|
+
/* Content too long, skip alignment processing for this cell */
|
|
2484
|
+
col_idx++;
|
|
2485
|
+
prev_cell_matching = matching;
|
|
2486
|
+
/* Continue with normal character-by-character processing */
|
|
2487
|
+
} else {
|
|
2488
|
+
/* Check for alignment colons */
|
|
2489
|
+
const char *content_start = cell_content_start;
|
|
2490
|
+
const char *content_end = cell_content_end;
|
|
2491
|
+
char *align_style = NULL;
|
|
2492
|
+
|
|
2493
|
+
if (process_cell_alignment(&content_start, &content_end, &align_style)) {
|
|
2494
|
+
/* Per-cell alignment detected - override column alignment */
|
|
2495
|
+
/* Copy the opening tag, but remove existing align attribute if present */
|
|
2496
|
+
if (has_align_attr && align_attr_start && align_attr_end) {
|
|
2497
|
+
/* Copy up to align attribute */
|
|
2498
|
+
while (read < align_attr_start) {
|
|
2499
|
+
*write++ = *read++;
|
|
2500
|
+
}
|
|
2501
|
+
/* Skip the align attribute (including quotes) */
|
|
2502
|
+
read = align_attr_end + 1;
|
|
2503
|
+
/* Remove any trailing space before '>' */
|
|
2504
|
+
while (read < tag_end && (*read == ' ' || *read == '\t')) {
|
|
2505
|
+
read++;
|
|
2506
|
+
}
|
|
2507
|
+
} else {
|
|
2508
|
+
/* Copy the opening tag up to '>' */
|
|
2509
|
+
while (*read && *read != '>') {
|
|
2510
|
+
*write++ = *read++;
|
|
2511
|
+
}
|
|
2512
|
+
}
|
|
2513
|
+
|
|
2514
|
+
/* Add style attribute before closing '>' */
|
|
2515
|
+
if (*read == '>') {
|
|
2516
|
+
/* Add style attribute (overrides column alignment) */
|
|
2517
|
+
*write++ = ' ';
|
|
2518
|
+
*write++ = 's';
|
|
2519
|
+
*write++ = 't';
|
|
2520
|
+
*write++ = 'y';
|
|
2521
|
+
*write++ = 'l';
|
|
2522
|
+
*write++ = 'e';
|
|
2523
|
+
*write++ = '=';
|
|
2524
|
+
*write++ = '"';
|
|
2525
|
+
const char *style_str = align_style;
|
|
2526
|
+
while (*style_str) {
|
|
2527
|
+
*write++ = *style_str++;
|
|
2528
|
+
}
|
|
2529
|
+
*write++ = '"';
|
|
2530
|
+
free(align_style);
|
|
2531
|
+
}
|
|
2532
|
+
|
|
2533
|
+
/* Copy the '>' */
|
|
2534
|
+
if (*read == '>') {
|
|
2535
|
+
*write++ = *read++;
|
|
2536
|
+
}
|
|
2537
|
+
|
|
2538
|
+
/* Copy modified content (with colons removed) */
|
|
2539
|
+
while (content_start < content_end) {
|
|
2540
|
+
*write++ = *content_start++;
|
|
2541
|
+
}
|
|
2542
|
+
|
|
2543
|
+
/* Skip original content and write closing tag */
|
|
2544
|
+
read = cell_content_end;
|
|
2545
|
+
memcpy(write, close_tag, 5);
|
|
2546
|
+
write += 5;
|
|
2547
|
+
read += 5;
|
|
2548
|
+
|
|
2549
|
+
col_idx++;
|
|
2550
|
+
prev_cell_matching = matching;
|
|
2551
|
+
continue;
|
|
2552
|
+
}
|
|
2553
|
+
/* If alignment processing failed, free align_style if it was allocated */
|
|
2554
|
+
if (align_style) {
|
|
2555
|
+
free(align_style);
|
|
2556
|
+
}
|
|
2557
|
+
}
|
|
2558
|
+
}
|
|
2559
|
+
}
|
|
2560
|
+
}
|
|
2561
|
+
} /* End of should_process_alignment check */
|
|
2562
|
+
|
|
2563
|
+
/* For normal cells (no special processing), copy the entire cell tag and content.
|
|
2564
|
+
* This ensures cells are properly output while still advancing read correctly. */
|
|
2565
|
+
if (!should_remove_cell && !matching && !make_row_header) {
|
|
2566
|
+
const char *cell_close_tag = is_th ? "</th>" : "</td>";
|
|
2567
|
+
/* Copy opening tag */
|
|
2568
|
+
while (*read && *read != '>') {
|
|
2569
|
+
*write++ = *read++;
|
|
2570
|
+
written++;
|
|
2571
|
+
}
|
|
2572
|
+
if (*read == '>') {
|
|
2573
|
+
*write++ = *read++;
|
|
2574
|
+
written++;
|
|
2575
|
+
}
|
|
2576
|
+
/* Copy content until closing tag */
|
|
2577
|
+
while (*read && strncmp(read, cell_close_tag, 5) != 0) {
|
|
2578
|
+
*write++ = *read++;
|
|
2579
|
+
written++;
|
|
2580
|
+
}
|
|
2581
|
+
/* Copy closing tag */
|
|
2582
|
+
if (strncmp(read, cell_close_tag, 5) == 0) {
|
|
2583
|
+
for (int i = 0; i < 5; i++) {
|
|
2584
|
+
*write++ = *read++;
|
|
2585
|
+
written++;
|
|
2586
|
+
}
|
|
2587
|
+
}
|
|
2588
|
+
col_idx++;
|
|
2589
|
+
prev_cell_matching = matching;
|
|
2590
|
+
continue;
|
|
2591
|
+
}
|
|
2592
|
+
|
|
2593
|
+
col_idx++;
|
|
2594
|
+
prev_cell_matching = matching; /* Track this cell for next cell's colspan check */
|
|
2595
|
+
}
|
|
2596
|
+
|
|
2597
|
+
/* Copy character */
|
|
2598
|
+
*write++ = *read++;
|
|
2599
|
+
written++;
|
|
2600
|
+
}
|
|
2601
|
+
|
|
2602
|
+
*write = '\0';
|
|
2603
|
+
|
|
2604
|
+
/* Clean up all_cells list */
|
|
2605
|
+
while (all_cells) {
|
|
2606
|
+
all_cell *next = all_cells->next;
|
|
2607
|
+
free(all_cells);
|
|
2608
|
+
all_cells = next;
|
|
2609
|
+
}
|
|
2610
|
+
|
|
2611
|
+
/* Clean up attributes list */
|
|
2612
|
+
while (attrs) {
|
|
2613
|
+
cell_attr *next = attrs->next;
|
|
2614
|
+
free(attrs->attributes);
|
|
2615
|
+
if (attrs->cell_text) free(attrs->cell_text);
|
|
2616
|
+
free(attrs);
|
|
2617
|
+
attrs = next;
|
|
2618
|
+
}
|
|
2619
|
+
|
|
2620
|
+
/* Clean up captions list */
|
|
2621
|
+
while (captions) {
|
|
2622
|
+
table_caption *next = captions->next;
|
|
2623
|
+
free(captions->caption);
|
|
2624
|
+
free(captions);
|
|
2625
|
+
captions = next;
|
|
2626
|
+
}
|
|
2627
|
+
|
|
2628
|
+
/* Clean up the list of paragraphs to remove */
|
|
2629
|
+
while (paras_to_remove) {
|
|
2630
|
+
para_to_remove *next = paras_to_remove->next;
|
|
2631
|
+
free(paras_to_remove->text_fingerprint);
|
|
2632
|
+
free(paras_to_remove);
|
|
2633
|
+
paras_to_remove = next;
|
|
2634
|
+
}
|
|
2635
|
+
|
|
2636
|
+
done:
|
|
2637
|
+
/* Replace placeholder for escaped \<< with << so literal << displays correctly. */
|
|
2638
|
+
if (output) {
|
|
2639
|
+
const char *replacement = ESCAPED_LTLT_REPLACEMENT;
|
|
2640
|
+
const size_t replacement_len = ESCAPED_LTLT_REPLACEMENT_LEN;
|
|
2641
|
+
|
|
2642
|
+
size_t out_len = strlen(output);
|
|
2643
|
+
size_t n = 0;
|
|
2644
|
+
const char *p = output;
|
|
2645
|
+
while (p + ESCAPED_LTLT_PLACEHOLDER_LEN <= output + out_len) {
|
|
2646
|
+
if (memcmp(p, ESCAPED_LTLT_PLACEHOLDER, ESCAPED_LTLT_PLACEHOLDER_LEN) == 0) {
|
|
2647
|
+
n++;
|
|
2648
|
+
p += ESCAPED_LTLT_PLACEHOLDER_LEN;
|
|
2649
|
+
} else {
|
|
2650
|
+
p++;
|
|
2651
|
+
}
|
|
2652
|
+
}
|
|
2653
|
+
if (n > 0) {
|
|
2654
|
+
size_t new_len = out_len - n * ESCAPED_LTLT_PLACEHOLDER_LEN + n * replacement_len;
|
|
2655
|
+
char *new_out = malloc(new_len + 1);
|
|
2656
|
+
if (new_out) {
|
|
2657
|
+
char *w = new_out;
|
|
2658
|
+
const char *r = output;
|
|
2659
|
+
while (r < output + out_len) {
|
|
2660
|
+
if (r + ESCAPED_LTLT_PLACEHOLDER_LEN <= output + out_len &&
|
|
2661
|
+
memcmp(r, ESCAPED_LTLT_PLACEHOLDER, ESCAPED_LTLT_PLACEHOLDER_LEN) == 0) {
|
|
2662
|
+
memcpy(w, replacement, replacement_len);
|
|
2663
|
+
w += replacement_len;
|
|
2664
|
+
r += ESCAPED_LTLT_PLACEHOLDER_LEN;
|
|
2665
|
+
} else {
|
|
2666
|
+
*w++ = *r++;
|
|
2667
|
+
}
|
|
2668
|
+
}
|
|
2669
|
+
*w = '\0';
|
|
2670
|
+
free(output);
|
|
2671
|
+
output = new_out;
|
|
2672
|
+
out_len = new_len;
|
|
2673
|
+
}
|
|
2674
|
+
}
|
|
2675
|
+
|
|
2676
|
+
}
|
|
2677
|
+
return output;
|
|
2678
|
+
}
|
|
2679
|
+
|