newstile 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/AUTHORS +1 -0
- data/CONTRIBUTERS +6 -0
- data/COPYING +24 -0
- data/ChangeLog +5489 -0
- data/GPL +674 -0
- data/README +31 -0
- data/Rakefile +342 -0
- data/VERSION +1 -0
- data/benchmark/benchmark.rb +34 -0
- data/benchmark/generate_data.rb +112 -0
- data/benchmark/historic-jruby-1.4.0.dat +7 -0
- data/benchmark/historic-ruby-1.8.6.dat +7 -0
- data/benchmark/historic-ruby-1.8.7.dat +7 -0
- data/benchmark/historic-ruby-1.9.1p243.dat +7 -0
- data/benchmark/historic-ruby-1.9.2dev.dat +7 -0
- data/benchmark/mdbasics.text +306 -0
- data/benchmark/mdsyntax.text +888 -0
- data/benchmark/static-jruby-1.4.0.dat +7 -0
- data/benchmark/static-ruby-1.8.6.dat +7 -0
- data/benchmark/static-ruby-1.8.7.dat +7 -0
- data/benchmark/static-ruby-1.9.1p243.dat +7 -0
- data/benchmark/static-ruby-1.9.2dev.dat +7 -0
- data/benchmark/testing.sh +9 -0
- data/benchmark/timing.sh +10 -0
- data/bin/newstile +82 -0
- data/data/newstile/document.html +18 -0
- data/data/newstile/document.latex +43 -0
- data/doc/default.scss.css +519 -0
- data/doc/default.template +80 -0
- data/doc/documentation.page +72 -0
- data/doc/index.page +96 -0
- data/doc/installation.page +90 -0
- data/doc/links.markdown +6 -0
- data/doc/news.feed +10 -0
- data/doc/news.page +28 -0
- data/doc/quickref.page +564 -0
- data/doc/syntax.page +1615 -0
- data/doc/tests.page +51 -0
- data/doc/virtual +2 -0
- data/lib/newstile.rb +23 -0
- data/lib/newstile/compatibility.rb +34 -0
- data/lib/newstile/converter.rb +43 -0
- data/lib/newstile/converter/base.rb +111 -0
- data/lib/newstile/converter/html.rb +405 -0
- data/lib/newstile/converter/latex.rb +577 -0
- data/lib/newstile/converter/markdown.rb +426 -0
- data/lib/newstile/converter/newstile.rb +426 -0
- data/lib/newstile/document.rb +168 -0
- data/lib/newstile/error.rb +27 -0
- data/lib/newstile/options.rb +296 -0
- data/lib/newstile/parser.rb +39 -0
- data/lib/newstile/parser/base.rb +94 -0
- data/lib/newstile/parser/html.rb +499 -0
- data/lib/newstile/parser/newstile.rb +325 -0
- data/lib/newstile/parser/newstile/abbreviation.rb +66 -0
- data/lib/newstile/parser/newstile/attribute_list.rb +111 -0
- data/lib/newstile/parser/newstile/autolink.rb +54 -0
- data/lib/newstile/parser/newstile/blank_line.rb +43 -0
- data/lib/newstile/parser/newstile/block_boundary.rb +46 -0
- data/lib/newstile/parser/newstile/blockquote.rb +63 -0
- data/lib/newstile/parser/newstile/codeblock.rb +60 -0
- data/lib/newstile/parser/newstile/codespan.rb +57 -0
- data/lib/newstile/parser/newstile/emphasis.rb +70 -0
- data/lib/newstile/parser/newstile/eob.rb +39 -0
- data/lib/newstile/parser/newstile/escaped_chars.rb +38 -0
- data/lib/newstile/parser/newstile/extension.rb +116 -0
- data/lib/newstile/parser/newstile/footnote.rb +74 -0
- data/lib/newstile/parser/newstile/header.rb +84 -0
- data/lib/newstile/parser/newstile/horizontal_rule.rb +39 -0
- data/lib/newstile/parser/newstile/html.rb +175 -0
- data/lib/newstile/parser/newstile/html_entity.rb +39 -0
- data/lib/newstile/parser/newstile/line_break.rb +38 -0
- data/lib/newstile/parser/newstile/link.rb +177 -0
- data/lib/newstile/parser/newstile/list.rb +239 -0
- data/lib/newstile/parser/newstile/math.rb +64 -0
- data/lib/newstile/parser/newstile/paragraph.rb +55 -0
- data/lib/newstile/parser/newstile/smart_quotes.rb +214 -0
- data/lib/newstile/parser/newstile/table.rb +134 -0
- data/lib/newstile/parser/newstile/typographic_symbol.rb +54 -0
- data/lib/newstile/utils.rb +37 -0
- data/lib/newstile/utils/entities.rb +336 -0
- data/lib/newstile/utils/html.rb +75 -0
- data/lib/newstile/utils/ordered_hash.rb +79 -0
- data/lib/newstile/version.rb +28 -0
- data/man/man1/newstile.1 +246 -0
- data/setup.rb +1585 -0
- data/test/run_tests.rb +59 -0
- data/test/test_files.rb +162 -0
- data/test/testcases/block/01_blank_line/spaces.html +1 -0
- data/test/testcases/block/01_blank_line/spaces.text +3 -0
- data/test/testcases/block/01_blank_line/tabs.html +1 -0
- data/test/testcases/block/01_blank_line/tabs.text +6 -0
- data/test/testcases/block/02_eob/beginning.html +1 -0
- data/test/testcases/block/02_eob/beginning.text +3 -0
- data/test/testcases/block/02_eob/end.html +1 -0
- data/test/testcases/block/02_eob/end.text +3 -0
- data/test/testcases/block/02_eob/middle.html +1 -0
- data/test/testcases/block/02_eob/middle.text +5 -0
- data/test/testcases/block/03_paragraph/indented.html +18 -0
- data/test/testcases/block/03_paragraph/indented.text +19 -0
- data/test/testcases/block/03_paragraph/no_newline_at_end.html +5 -0
- data/test/testcases/block/03_paragraph/no_newline_at_end.text +5 -0
- data/test/testcases/block/03_paragraph/one_para.html +1 -0
- data/test/testcases/block/03_paragraph/one_para.text +1 -0
- data/test/testcases/block/03_paragraph/two_para.html +4 -0
- data/test/testcases/block/03_paragraph/two_para.text +4 -0
- data/test/testcases/block/04_header/atx_header.html +37 -0
- data/test/testcases/block/04_header/atx_header.text +34 -0
- data/test/testcases/block/04_header/atx_header_no_newline_at_end.html +1 -0
- data/test/testcases/block/04_header/atx_header_no_newline_at_end.text +1 -0
- data/test/testcases/block/04_header/setext_header.html +30 -0
- data/test/testcases/block/04_header/setext_header.html.19 +30 -0
- data/test/testcases/block/04_header/setext_header.text +36 -0
- data/test/testcases/block/04_header/setext_header_no_newline_at_end.html +1 -0
- data/test/testcases/block/04_header/setext_header_no_newline_at_end.text +2 -0
- data/test/testcases/block/04_header/with_auto_id_prefix.html +3 -0
- data/test/testcases/block/04_header/with_auto_id_prefix.options +2 -0
- data/test/testcases/block/04_header/with_auto_id_prefix.text +3 -0
- data/test/testcases/block/04_header/with_auto_ids.html +17 -0
- data/test/testcases/block/04_header/with_auto_ids.options +1 -0
- data/test/testcases/block/04_header/with_auto_ids.text +19 -0
- data/test/testcases/block/05_blockquote/indented.html +25 -0
- data/test/testcases/block/05_blockquote/indented.text +14 -0
- data/test/testcases/block/05_blockquote/lazy.html +34 -0
- data/test/testcases/block/05_blockquote/lazy.text +20 -0
- data/test/testcases/block/05_blockquote/nested.html +10 -0
- data/test/testcases/block/05_blockquote/nested.text +6 -0
- data/test/testcases/block/05_blockquote/no_newline_at_end.html +4 -0
- data/test/testcases/block/05_blockquote/no_newline_at_end.text +2 -0
- data/test/testcases/block/05_blockquote/with_code_blocks.html +15 -0
- data/test/testcases/block/05_blockquote/with_code_blocks.text +11 -0
- data/test/testcases/block/06_codeblock/error.html +4 -0
- data/test/testcases/block/06_codeblock/error.text +4 -0
- data/test/testcases/block/06_codeblock/lazy.html +4 -0
- data/test/testcases/block/06_codeblock/lazy.text +5 -0
- data/test/testcases/block/06_codeblock/no_newline_at_end.html +2 -0
- data/test/testcases/block/06_codeblock/no_newline_at_end.text +1 -0
- data/test/testcases/block/06_codeblock/no_newline_at_end_1.html +2 -0
- data/test/testcases/block/06_codeblock/no_newline_at_end_1.text +2 -0
- data/test/testcases/block/06_codeblock/normal.html +13 -0
- data/test/testcases/block/06_codeblock/normal.text +10 -0
- data/test/testcases/block/06_codeblock/tilde_syntax.html +7 -0
- data/test/testcases/block/06_codeblock/tilde_syntax.text +9 -0
- data/test/testcases/block/06_codeblock/whitespace.html +3 -0
- data/test/testcases/block/06_codeblock/whitespace.text +3 -0
- data/test/testcases/block/06_codeblock/with_blank_line.html +13 -0
- data/test/testcases/block/06_codeblock/with_blank_line.text +12 -0
- data/test/testcases/block/06_codeblock/with_eob_marker.html +6 -0
- data/test/testcases/block/06_codeblock/with_eob_marker.text +5 -0
- data/test/testcases/block/06_codeblock/with_ial.html +6 -0
- data/test/testcases/block/06_codeblock/with_ial.text +5 -0
- data/test/testcases/block/07_horizontal_rule/error.html +7 -0
- data/test/testcases/block/07_horizontal_rule/error.html.19 +7 -0
- data/test/testcases/block/07_horizontal_rule/error.text +7 -0
- data/test/testcases/block/07_horizontal_rule/normal.html +17 -0
- data/test/testcases/block/07_horizontal_rule/normal.text +17 -0
- data/test/testcases/block/07_horizontal_rule/sepspaces.html +3 -0
- data/test/testcases/block/07_horizontal_rule/sepspaces.text +3 -0
- data/test/testcases/block/07_horizontal_rule/septabs.html +3 -0
- data/test/testcases/block/07_horizontal_rule/septabs.text +3 -0
- data/test/testcases/block/08_list/escaping.html +17 -0
- data/test/testcases/block/08_list/escaping.text +17 -0
- data/test/testcases/block/08_list/item_ial.html +7 -0
- data/test/testcases/block/08_list/item_ial.text +5 -0
- data/test/testcases/block/08_list/lazy.html +39 -0
- data/test/testcases/block/08_list/lazy.text +29 -0
- data/test/testcases/block/08_list/list_and_hr.html +9 -0
- data/test/testcases/block/08_list/list_and_hr.text +5 -0
- data/test/testcases/block/08_list/list_and_others.html +40 -0
- data/test/testcases/block/08_list/list_and_others.text +26 -0
- data/test/testcases/block/08_list/mixed.html +117 -0
- data/test/testcases/block/08_list/mixed.text +66 -0
- data/test/testcases/block/08_list/nested.html +17 -0
- data/test/testcases/block/08_list/nested.text +7 -0
- data/test/testcases/block/08_list/other_first_element.html +39 -0
- data/test/testcases/block/08_list/other_first_element.text +18 -0
- data/test/testcases/block/08_list/simple_ol.html +19 -0
- data/test/testcases/block/08_list/simple_ol.text +13 -0
- data/test/testcases/block/08_list/simple_ul.html +48 -0
- data/test/testcases/block/08_list/simple_ul.text +36 -0
- data/test/testcases/block/08_list/single_item.html +3 -0
- data/test/testcases/block/08_list/single_item.text +1 -0
- data/test/testcases/block/08_list/special_cases.html +55 -0
- data/test/testcases/block/08_list/special_cases.text +35 -0
- data/test/testcases/block/09_html/comment.html +18 -0
- data/test/testcases/block/09_html/comment.text +15 -0
- data/test/testcases/block/09_html/content_model/deflists.html +6 -0
- data/test/testcases/block/09_html/content_model/deflists.options +1 -0
- data/test/testcases/block/09_html/content_model/deflists.text +6 -0
- data/test/testcases/block/09_html/content_model/tables.html +14 -0
- data/test/testcases/block/09_html/content_model/tables.options +1 -0
- data/test/testcases/block/09_html/content_model/tables.text +14 -0
- data/test/testcases/block/09_html/html_and_codeblocks.html +15 -0
- data/test/testcases/block/09_html/html_and_codeblocks.options +1 -0
- data/test/testcases/block/09_html/html_and_codeblocks.text +13 -0
- data/test/testcases/block/09_html/html_to_native/code.html +10 -0
- data/test/testcases/block/09_html/html_to_native/code.text +9 -0
- data/test/testcases/block/09_html/html_to_native/comment.html +7 -0
- data/test/testcases/block/09_html/html_to_native/comment.text +8 -0
- data/test/testcases/block/09_html/html_to_native/emphasis.html +3 -0
- data/test/testcases/block/09_html/html_to_native/emphasis.text +3 -0
- data/test/testcases/block/09_html/html_to_native/entity.html +1 -0
- data/test/testcases/block/09_html/html_to_native/entity.text +1 -0
- data/test/testcases/block/09_html/html_to_native/header.html +6 -0
- data/test/testcases/block/09_html/html_to_native/header.options +2 -0
- data/test/testcases/block/09_html/html_to_native/header.text +6 -0
- data/test/testcases/block/09_html/html_to_native/list_dl.html +8 -0
- data/test/testcases/block/09_html/html_to_native/list_dl.text +8 -0
- data/test/testcases/block/09_html/html_to_native/list_ol.html +15 -0
- data/test/testcases/block/09_html/html_to_native/list_ol.text +17 -0
- data/test/testcases/block/09_html/html_to_native/list_ul.html +19 -0
- data/test/testcases/block/09_html/html_to_native/list_ul.text +22 -0
- data/test/testcases/block/09_html/html_to_native/options +1 -0
- data/test/testcases/block/09_html/html_to_native/paragraph.html +3 -0
- data/test/testcases/block/09_html/html_to_native/paragraph.text +4 -0
- data/test/testcases/block/09_html/html_to_native/table_normal.html +14 -0
- data/test/testcases/block/09_html/html_to_native/table_normal.text +12 -0
- data/test/testcases/block/09_html/html_to_native/table_simple.html +48 -0
- data/test/testcases/block/09_html/html_to_native/table_simple.text +56 -0
- data/test/testcases/block/09_html/html_to_native/typography.html +1 -0
- data/test/testcases/block/09_html/html_to_native/typography.html.19 +1 -0
- data/test/testcases/block/09_html/html_to_native/typography.text +1 -0
- data/test/testcases/block/09_html/invalid_html_1.html +5 -0
- data/test/testcases/block/09_html/invalid_html_1.text +5 -0
- data/test/testcases/block/09_html/invalid_html_2.html +5 -0
- data/test/testcases/block/09_html/invalid_html_2.text +5 -0
- data/test/testcases/block/09_html/markdown_attr.html +38 -0
- data/test/testcases/block/09_html/markdown_attr.text +38 -0
- data/test/testcases/block/09_html/not_parsed.html +24 -0
- data/test/testcases/block/09_html/not_parsed.text +24 -0
- data/test/testcases/block/09_html/parse_as_raw.html +30 -0
- data/test/testcases/block/09_html/parse_as_raw.options +1 -0
- data/test/testcases/block/09_html/parse_as_raw.text +29 -0
- data/test/testcases/block/09_html/parse_as_span.html +12 -0
- data/test/testcases/block/09_html/parse_as_span.options +1 -0
- data/test/testcases/block/09_html/parse_as_span.text +9 -0
- data/test/testcases/block/09_html/parse_block_html.html +21 -0
- data/test/testcases/block/09_html/parse_block_html.options +1 -0
- data/test/testcases/block/09_html/parse_block_html.text +17 -0
- data/test/testcases/block/09_html/processing_instruction.html +13 -0
- data/test/testcases/block/09_html/processing_instruction.text +12 -0
- data/test/testcases/block/09_html/simple.html +64 -0
- data/test/testcases/block/09_html/simple.html.19 +64 -0
- data/test/testcases/block/09_html/simple.options +1 -0
- data/test/testcases/block/09_html/simple.text +59 -0
- data/test/testcases/block/10_ald/simple.html +2 -0
- data/test/testcases/block/10_ald/simple.text +8 -0
- data/test/testcases/block/11_ial/auto_id_and_ial.html +1 -0
- data/test/testcases/block/11_ial/auto_id_and_ial.options +1 -0
- data/test/testcases/block/11_ial/auto_id_and_ial.text +2 -0
- data/test/testcases/block/11_ial/simple.html +25 -0
- data/test/testcases/block/11_ial/simple.text +34 -0
- data/test/testcases/block/12_extension/comment.html +8 -0
- data/test/testcases/block/12_extension/comment.text +12 -0
- data/test/testcases/block/12_extension/ignored.html +8 -0
- data/test/testcases/block/12_extension/ignored.text +8 -0
- data/test/testcases/block/12_extension/nomarkdown.html +10 -0
- data/test/testcases/block/12_extension/nomarkdown.kramdown +20 -0
- data/test/testcases/block/12_extension/nomarkdown.latex +13 -0
- data/test/testcases/block/12_extension/nomarkdown.text +21 -0
- data/test/testcases/block/12_extension/options.html +21 -0
- data/test/testcases/block/12_extension/options.text +21 -0
- data/test/testcases/block/12_extension/options2.html +10 -0
- data/test/testcases/block/12_extension/options2.text +5 -0
- data/test/testcases/block/12_extension/options3.html +7 -0
- data/test/testcases/block/12_extension/options3.text +7 -0
- data/test/testcases/block/13_definition_list/definition_at_beginning.html +1 -0
- data/test/testcases/block/13_definition_list/definition_at_beginning.text +1 -0
- data/test/testcases/block/13_definition_list/item_ial.html +12 -0
- data/test/testcases/block/13_definition_list/item_ial.text +8 -0
- data/test/testcases/block/13_definition_list/multiple_terms.html +13 -0
- data/test/testcases/block/13_definition_list/multiple_terms.text +10 -0
- data/test/testcases/block/13_definition_list/no_def_list.html +2 -0
- data/test/testcases/block/13_definition_list/no_def_list.text +2 -0
- data/test/testcases/block/13_definition_list/para_wrapping.html +10 -0
- data/test/testcases/block/13_definition_list/para_wrapping.text +6 -0
- data/test/testcases/block/13_definition_list/separated_by_eob.html +8 -0
- data/test/testcases/block/13_definition_list/separated_by_eob.text +5 -0
- data/test/testcases/block/13_definition_list/simple.html +8 -0
- data/test/testcases/block/13_definition_list/simple.text +7 -0
- data/test/testcases/block/13_definition_list/styled_terms.html +4 -0
- data/test/testcases/block/13_definition_list/styled_terms.text +2 -0
- data/test/testcases/block/13_definition_list/too_much_space.html +3 -0
- data/test/testcases/block/13_definition_list/too_much_space.text +4 -0
- data/test/testcases/block/13_definition_list/with_blocks.html +38 -0
- data/test/testcases/block/13_definition_list/with_blocks.text +24 -0
- data/test/testcases/block/14_table/errors.html +8 -0
- data/test/testcases/block/14_table/errors.text +9 -0
- data/test/testcases/block/14_table/footer.html +65 -0
- data/test/testcases/block/14_table/footer.text +25 -0
- data/test/testcases/block/14_table/header.html +103 -0
- data/test/testcases/block/14_table/header.text +32 -0
- data/test/testcases/block/14_table/no_table.html +3 -0
- data/test/testcases/block/14_table/no_table.text +3 -0
- data/test/testcases/block/14_table/simple.html +139 -0
- data/test/testcases/block/14_table/simple.text +38 -0
- data/test/testcases/block/15_math/normal.html +26 -0
- data/test/testcases/block/15_math/normal.text +28 -0
- data/test/testcases/block/16_toc/no_toc_depth.html +33 -0
- data/test/testcases/block/16_toc/no_toc_depth.options +1 -0
- data/test/testcases/block/16_toc/no_toc_depth.text +16 -0
- data/test/testcases/block/16_toc/toc_depth_2.html +24 -0
- data/test/testcases/block/16_toc/toc_depth_2.options +1 -0
- data/test/testcases/block/16_toc/toc_depth_2.text +16 -0
- data/test/testcases/encoding.html +46 -0
- data/test/testcases/encoding.text +28 -0
- data/test/testcases/span/01_link/empty.html +5 -0
- data/test/testcases/span/01_link/empty.text +5 -0
- data/test/testcases/span/01_link/image_in_a.html +5 -0
- data/test/testcases/span/01_link/image_in_a.text +5 -0
- data/test/testcases/span/01_link/imagelinks.html +14 -0
- data/test/testcases/span/01_link/imagelinks.text +16 -0
- data/test/testcases/span/01_link/inline.html +46 -0
- data/test/testcases/span/01_link/inline.html.19 +46 -0
- data/test/testcases/span/01_link/inline.text +48 -0
- data/test/testcases/span/01_link/link_defs.html +9 -0
- data/test/testcases/span/01_link/link_defs.text +26 -0
- data/test/testcases/span/01_link/links_with_angle_brackets.html +3 -0
- data/test/testcases/span/01_link/links_with_angle_brackets.text +3 -0
- data/test/testcases/span/01_link/reference.html +35 -0
- data/test/testcases/span/01_link/reference.html.19 +35 -0
- data/test/testcases/span/01_link/reference.text +47 -0
- data/test/testcases/span/02_emphasis/empty.html +3 -0
- data/test/testcases/span/02_emphasis/empty.text +3 -0
- data/test/testcases/span/02_emphasis/errors.html +9 -0
- data/test/testcases/span/02_emphasis/errors.text +9 -0
- data/test/testcases/span/02_emphasis/nesting.html +38 -0
- data/test/testcases/span/02_emphasis/nesting.text +33 -0
- data/test/testcases/span/02_emphasis/normal.html +46 -0
- data/test/testcases/span/02_emphasis/normal.text +46 -0
- data/test/testcases/span/03_codespan/empty.html +5 -0
- data/test/testcases/span/03_codespan/empty.text +5 -0
- data/test/testcases/span/03_codespan/errors.html +1 -0
- data/test/testcases/span/03_codespan/errors.text +1 -0
- data/test/testcases/span/03_codespan/highlighting.html +1 -0
- data/test/testcases/span/03_codespan/highlighting.text +1 -0
- data/test/testcases/span/03_codespan/normal.html +16 -0
- data/test/testcases/span/03_codespan/normal.text +16 -0
- data/test/testcases/span/04_footnote/definitions.html +17 -0
- data/test/testcases/span/04_footnote/definitions.latex +17 -0
- data/test/testcases/span/04_footnote/definitions.text +24 -0
- data/test/testcases/span/04_footnote/footnote_nr.html +12 -0
- data/test/testcases/span/04_footnote/footnote_nr.latex +2 -0
- data/test/testcases/span/04_footnote/footnote_nr.options +1 -0
- data/test/testcases/span/04_footnote/footnote_nr.text +4 -0
- data/test/testcases/span/04_footnote/markers.html +46 -0
- data/test/testcases/span/04_footnote/markers.latex +23 -0
- data/test/testcases/span/04_footnote/markers.text +26 -0
- data/test/testcases/span/05_html/across_lines.html +1 -0
- data/test/testcases/span/05_html/across_lines.text +2 -0
- data/test/testcases/span/05_html/invalid.html +1 -0
- data/test/testcases/span/05_html/invalid.text +1 -0
- data/test/testcases/span/05_html/link_with_mailto.html +1 -0
- data/test/testcases/span/05_html/link_with_mailto.text +1 -0
- data/test/testcases/span/05_html/markdown_attr.html +6 -0
- data/test/testcases/span/05_html/markdown_attr.text +6 -0
- data/test/testcases/span/05_html/normal.html +30 -0
- data/test/testcases/span/05_html/normal.text +30 -0
- data/test/testcases/span/abbreviations/abbrev.html +8 -0
- data/test/testcases/span/abbreviations/abbrev.text +15 -0
- data/test/testcases/span/abbreviations/abbrev_defs.html +2 -0
- data/test/testcases/span/abbreviations/abbrev_defs.text +5 -0
- data/test/testcases/span/autolinks/url_links.html +12 -0
- data/test/testcases/span/autolinks/url_links.text +12 -0
- data/test/testcases/span/escaped_chars/normal.html +43 -0
- data/test/testcases/span/escaped_chars/normal.text +43 -0
- data/test/testcases/span/extension/comment.html +6 -0
- data/test/testcases/span/extension/comment.text +6 -0
- data/test/testcases/span/extension/ignored.html +1 -0
- data/test/testcases/span/extension/ignored.text +1 -0
- data/test/testcases/span/extension/nomarkdown.html +1 -0
- data/test/testcases/span/extension/nomarkdown.text +1 -0
- data/test/testcases/span/extension/options.html +1 -0
- data/test/testcases/span/extension/options.text +1 -0
- data/test/testcases/span/ial/simple.html +6 -0
- data/test/testcases/span/ial/simple.text +6 -0
- data/test/testcases/span/line_breaks/normal.html +11 -0
- data/test/testcases/span/line_breaks/normal.latex +12 -0
- data/test/testcases/span/line_breaks/normal.text +11 -0
- data/test/testcases/span/math/normal.html +5 -0
- data/test/testcases/span/math/normal.text +5 -0
- data/test/testcases/span/text_substitutions/entities.html +4 -0
- data/test/testcases/span/text_substitutions/entities.options +1 -0
- data/test/testcases/span/text_substitutions/entities.text +4 -0
- data/test/testcases/span/text_substitutions/entities_as_char.html +1 -0
- data/test/testcases/span/text_substitutions/entities_as_char.html.19 +1 -0
- data/test/testcases/span/text_substitutions/entities_as_char.options +1 -0
- data/test/testcases/span/text_substitutions/entities_as_char.text +1 -0
- data/test/testcases/span/text_substitutions/entities_as_input.html +1 -0
- data/test/testcases/span/text_substitutions/entities_as_input.options +1 -0
- data/test/testcases/span/text_substitutions/entities_as_input.text +1 -0
- data/test/testcases/span/text_substitutions/entities_numeric.html +1 -0
- data/test/testcases/span/text_substitutions/entities_numeric.options +1 -0
- data/test/testcases/span/text_substitutions/entities_numeric.text +1 -0
- data/test/testcases/span/text_substitutions/entities_symbolic.html +1 -0
- data/test/testcases/span/text_substitutions/entities_symbolic.options +1 -0
- data/test/testcases/span/text_substitutions/entities_symbolic.text +1 -0
- data/test/testcases/span/text_substitutions/greaterthan.html +1 -0
- data/test/testcases/span/text_substitutions/greaterthan.text +1 -0
- data/test/testcases/span/text_substitutions/lowerthan.html +1 -0
- data/test/testcases/span/text_substitutions/lowerthan.text +1 -0
- data/test/testcases/span/text_substitutions/typography.html +18 -0
- data/test/testcases/span/text_substitutions/typography.html.19 +18 -0
- data/test/testcases/span/text_substitutions/typography.text +18 -0
- metadata +476 -0
@@ -0,0 +1,39 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#
|
3
|
+
#--
|
4
|
+
# Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
|
5
|
+
#
|
6
|
+
# This file is part of newstile.
|
7
|
+
#
|
8
|
+
# newstile is free software: you can redistribute it and/or modify
|
9
|
+
# it under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation, either version 3 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# This program is distributed in the hope that it will be useful,
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
+
# GNU General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
23
|
+
module Newstile
|
24
|
+
|
25
|
+
# == Parser Module
|
26
|
+
#
|
27
|
+
# This module contains all available parsers. Currently, there are two parsers:
|
28
|
+
#
|
29
|
+
# * Newstile for parsing documents in newstile format
|
30
|
+
# * Html for parsing HTML documents
|
31
|
+
module Parser
|
32
|
+
|
33
|
+
autoload :Base, 'newstile/parser/base'
|
34
|
+
autoload :Newstile, 'newstile/parser/newstile'
|
35
|
+
autoload :Html, 'newstile/parser/html'
|
36
|
+
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#
|
3
|
+
#--
|
4
|
+
# Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
|
5
|
+
#
|
6
|
+
# This file is part of newstile.
|
7
|
+
#
|
8
|
+
# newstile is free software: you can redistribute it and/or modify
|
9
|
+
# it under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation, either version 3 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# This program is distributed in the hope that it will be useful,
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
+
# GNU General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
23
|
+
module Newstile
|
24
|
+
|
25
|
+
module Parser
|
26
|
+
|
27
|
+
# == Base class for parsers
|
28
|
+
#
|
29
|
+
# This class serves as base class for parsers. It provides common methods that can/should be
|
30
|
+
# used by all parsers, especially by those using StringScanner for parsing.
|
31
|
+
#
|
32
|
+
class Base
|
33
|
+
|
34
|
+
# Initialize the parser with the given Newstile document +doc+.
|
35
|
+
def initialize(doc)
|
36
|
+
@doc = doc
|
37
|
+
@text_type = :text
|
38
|
+
end
|
39
|
+
private_class_method(:new, :allocate)
|
40
|
+
|
41
|
+
# Parse the +source+ string into an element tree, using the information provided by the
|
42
|
+
# Newstile document +doc+.
|
43
|
+
#
|
44
|
+
# Initializes a new instance of the calling class and then calls the #parse method that must
|
45
|
+
# be implemented by each subclass.
|
46
|
+
def self.parse(source, doc)
|
47
|
+
new(doc).parse(source)
|
48
|
+
end
|
49
|
+
|
50
|
+
|
51
|
+
# Add the given warning +text+ to the warning array of the Newstile document.
|
52
|
+
def warning(text)
|
53
|
+
@doc.warnings << text
|
54
|
+
#TODO: add position information
|
55
|
+
end
|
56
|
+
|
57
|
+
# Modify the string +source+ to be usable by the parser.
|
58
|
+
def adapt_source(source)
|
59
|
+
source.gsub(/\r\n?/, "\n").chomp + "\n"
|
60
|
+
end
|
61
|
+
|
62
|
+
# This helper method adds the given +text+ either to the last element in the +tree+ if it is a
|
63
|
+
# +type+ element or creates a new text element with the given +type+.
|
64
|
+
def add_text(text, tree = @tree, type = @text_type)
|
65
|
+
if tree.children.last && tree.children.last.type == type
|
66
|
+
tree.children.last.value << text
|
67
|
+
elsif !text.empty?
|
68
|
+
tree.children << Element.new(type, text)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
# Extract the part of the StringScanner +strscan+ backed string specified by the +range+. This
|
73
|
+
# method also works correctly under Ruby 1.9.
|
74
|
+
def extract_string(range, strscan)
|
75
|
+
result = nil
|
76
|
+
if RUBY_VERSION >= '1.9'
|
77
|
+
begin
|
78
|
+
enc = strscan.string.encoding
|
79
|
+
strscan.string.force_encoding('ASCII-8BIT')
|
80
|
+
result = strscan.string[range].force_encoding(enc)
|
81
|
+
ensure
|
82
|
+
strscan.string.force_encoding(enc)
|
83
|
+
end
|
84
|
+
else
|
85
|
+
result = strscan.string[range]
|
86
|
+
end
|
87
|
+
result
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
92
|
+
end
|
93
|
+
|
94
|
+
end
|
@@ -0,0 +1,499 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#
|
3
|
+
#--
|
4
|
+
# Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
|
5
|
+
#
|
6
|
+
# This file is part of newstile.
|
7
|
+
#
|
8
|
+
# newstile is free software: you can redistribute it and/or modify
|
9
|
+
# it under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation, either version 3 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# This program is distributed in the hope that it will be useful,
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
+
# GNU General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
23
|
+
require 'rexml/parsers/baseparser'
|
24
|
+
require 'strscan'
|
25
|
+
|
26
|
+
module Newstile
|
27
|
+
|
28
|
+
module Parser
|
29
|
+
|
30
|
+
# Used for parsing a HTML document.
|
31
|
+
class Html < Base
|
32
|
+
|
33
|
+
# Contains all constants that are used when parsing.
|
34
|
+
module Constants
|
35
|
+
#:stopdoc:
|
36
|
+
# The following regexps are based on the ones used by REXML, with some slight modifications.
|
37
|
+
HTML_DOCTYPE_RE = /<!DOCTYPE.*?>/m
|
38
|
+
HTML_COMMENT_RE = /<!--(.*?)-->/m
|
39
|
+
HTML_INSTRUCTION_RE = /<\?(.*?)\?>/m
|
40
|
+
HTML_ATTRIBUTE_RE = /\s*(#{REXML::Parsers::BaseParser::UNAME_STR})\s*=\s*(["'])(.*?)\2/m
|
41
|
+
HTML_TAG_RE = /<((?>#{REXML::Parsers::BaseParser::UNAME_STR}))\s*((?>\s+#{REXML::Parsers::BaseParser::UNAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/m
|
42
|
+
HTML_TAG_CLOSE_RE = /<\/(#{REXML::Parsers::BaseParser::UNAME_STR})\s*>/m
|
43
|
+
HTML_ENTITY_RE = /&([\w:][\-\w\.:]*);|&#(\d+);|&\#x([0-9a-fA-F]+);/
|
44
|
+
|
45
|
+
|
46
|
+
HTML_PARSE_AS_BLOCK = %w{applet button blockquote body colgroup dd div dl fieldset form iframe li
|
47
|
+
map noscript object ol table tbody thead tfoot tr td ul}
|
48
|
+
HTML_PARSE_AS_SPAN = %w{a abbr acronym address b bdo big cite caption del dfn dt em
|
49
|
+
h1 h2 h3 h4 h5 h6 i ins kbd label legend optgroup p q rb rbc
|
50
|
+
rp rt rtc ruby samp select small span strong sub sup th tt var}
|
51
|
+
HTML_PARSE_AS_RAW = %w{script math option textarea pre code}
|
52
|
+
|
53
|
+
HTML_PARSE_AS = Hash.new {|h,k| h[k] = :raw}
|
54
|
+
HTML_PARSE_AS_BLOCK.each {|i| HTML_PARSE_AS[i] = :block}
|
55
|
+
HTML_PARSE_AS_SPAN.each {|i| HTML_PARSE_AS[i] = :span}
|
56
|
+
HTML_PARSE_AS_RAW.each {|i| HTML_PARSE_AS[i] = :raw}
|
57
|
+
|
58
|
+
# Some HTML elements like script belong to both categories (i.e. are valid in block and
|
59
|
+
# span HTML) and therefore don't appear in these lists!
|
60
|
+
HTML_SPAN_ELEMENTS = %w{a abbr acronym b big bdo br button cite code del dfn em i img input
|
61
|
+
ins kbd label option q rb rbc rp rt rtc ruby samp select small span
|
62
|
+
strong sub sup textarea tt var}
|
63
|
+
HTML_BLOCK_ELEMENTS = %w{address article aside applet body button blockquote caption col colgroup dd div dl dt fieldset
|
64
|
+
figcaption footer form h1 h2 h3 h4 h5 h6 header hgroup hr html head iframe legend listing menu
|
65
|
+
li map nav ol optgroup p pre section summary table tbody td th thead tfoot tr ul}
|
66
|
+
HTML_ELEMENTS_WITHOUT_BODY = %w{area base br col command embed hr img input keygen link meta param source track wbr}
|
67
|
+
end
|
68
|
+
|
69
|
+
|
70
|
+
# Contains the parsing methods. This module can be mixed into any parser to get HTML parsing
|
71
|
+
# functionality. The only things that must be provided by the class are the instance variables
|
72
|
+
# <tt>@stack</tt> for storing needed state and <tt>@src</tt> (instance of StringScanner) for
|
73
|
+
# the actual parsing.
|
74
|
+
module Parser
|
75
|
+
|
76
|
+
include Constants
|
77
|
+
|
78
|
+
# Process the HTML start tag that has already been scanned/checked. Does the common processing
|
79
|
+
# steps and then yields to the caller for further processing.
|
80
|
+
def handle_html_start_tag
  # Reads the capture groups of the regexp last matched on @src: group 1 is used as the tag
  # name, group 2 is scanned for attributes and a non-nil group 4 marks the tag as closed.
  tag_name = @src[1]
  is_closed = !@src[4].nil?

  attributes = Utils::OrderedHash.new
  @src[2].scan(HTML_ATTRIBUTE_RE) { |key, _sep, value| attributes[key] = value }

  element = Element.new(:html_element, tag_name, attributes, :category => :block)
  @tree.children << element

  if !is_closed && HTML_ELEMENTS_WITHOUT_BODY.include?(element.value)
    warning("The HTML tag '#{element.value}' cannot have any content - auto-closing it")
    is_closed = true
  end

  # The content of a script element is consumed right here, so the caller always sees it
  # as closed.
  if tag_name == 'script'
    handle_html_script_tag
    is_closed = true
  end
  yield(element, is_closed)
end
|
100
|
+
|
101
|
+
# Consume the content of a script element starting at the current position of @src and add
# it as raw text to the last child of @tree. If a closing script tag is found it is consumed
# as well; otherwise the rest of the source is taken and a warning is issued.
def handle_html_script_tag
  start_pos = @src.pos
  script_el = @tree.children.last
  if @src.scan_until(/(?=<\/script\s*>)/m).nil?
    add_text(@src.scan(/.*/m), script_el, :raw)
    warning("Found no end tag for 'script' - auto-closing it")
  else
    # The look-ahead is zero-width, so @src.pos now sits directly before the end tag.
    body = extract_string(start_pos...@src.pos, @src)
    add_text(body, script_el, :raw)
    @src.scan(HTML_TAG_CLOSE_RE)
  end
end
|
111
|
+
|
112
|
+
# Zero-width look-ahead used by #parse_raw_html to find the next position where a '<' is
# followed by REXML's UNAME_STR pattern, a '/', '!--' or '?'.
HTML_RAW_START = /(?=<(#{REXML::Parsers::BaseParser::UNAME_STR}|\/|!--|\?))/
|
113
|
+
|
114
|
+
# Parse raw HTML from the current source position, storing the found elements in +el+.
|
115
|
+
# Parsing continues until one of the following criteria is fulfilled:
|
116
|
+
#
|
117
|
+
# - The end of the document is reached.
|
118
|
+
# - The matching end tag for the element +el+ is found (only used if +el+ is an HTML
|
119
|
+
# element).
|
120
|
+
#
|
121
|
+
# When an HTML start tag is found, processing is deferred to #handle_html_start_tag,
|
122
|
+
# providing the block given to this method.
|
123
|
+
def parse_raw_html(el, &block)
  # +el+ becomes the current tree while parsing; the previous tree is restored at the end.
  @stack.push(@tree)
  @tree = el

  finished = false
  until @src.eos? || finished
    leading_text = @src.scan_until(HTML_RAW_START)
    if leading_text.nil?
      # Nothing that looks like markup is left: the remainder is plain text.
      add_text(@src.scan(/.*/m), @tree, :text)
      warning("Found no end tag for '#{@tree.value}' - auto-closing it") if @tree.type == :html_element
      finished = true
      next
    end

    add_text(leading_text, @tree, :text)
    if (comment = @src.scan(HTML_COMMENT_RE))
      @tree.children << Element.new(:xml_comment, comment, nil, :category => :block)
    elsif (instruction = @src.scan(HTML_INSTRUCTION_RE))
      @tree.children << Element.new(:xml_pi, instruction, nil, :category => :block)
    elsif @src.scan(HTML_TAG_RE)
      handle_html_start_tag(&block)
    elsif @src.scan(HTML_TAG_CLOSE_RE)
      if @tree.value == @src[1]
        finished = true
      else
        warning("Found invalidly used HTML closing tag for '#{@src[1]}' - ignoring it")
      end
    else
      # The '<' did not start anything recognisable: treat this one character as text.
      add_text(@src.scan(/./), @tree, :text)
    end
  end

  @tree = @stack.pop
end
|
156
|
+
|
157
|
+
end
|
158
|
+
|
159
|
+
|
160
|
+
# Converts HTML elements to native elements if possible.
|
161
|
+
class ElementConverter
|
162
|
+
|
163
|
+
include Constants
|
164
|
+
include ::Newstile::Utils::Entities
|
165
|
+
|
166
|
+
REMOVE_TEXT_CHILDREN = %w{html head hgroup ol ul dl table colgroup tbody thead tfoot tr select optgroup}
|
167
|
+
WRAP_TEXT_CHILDREN = %w{body section nav article aside header footer address div li dd blockquote figure
|
168
|
+
figcaption fieldset form}
|
169
|
+
REMOVE_WHITESPACE_CHILDREN = %w{body section nav article aside header footer address
|
170
|
+
div li dd blockquote figure figcaption td th fieldset form}
|
171
|
+
STRIP_WHITESPACE = %w{address article aside blockquote body caption dd div dl dt fieldset figcaption form footer
|
172
|
+
header h1 h2 h3 h4 h5 h6 legend li nav p section td th}
|
173
|
+
SIMPLE_ELEMENTS = %w{em strong blockquote hr br img p thead tbody tfoot tr td th ul ol dl li dl dt dd}
|
174
|
+
|
175
|
+
# Create a converter for +doc+; the document is kept in <tt>@doc</tt> (#convert_code reads
# the encoding from its parse infos).
def initialize(doc)
  @doc = doc
end
|
178
|
+
|
179
|
+
# Convert the element +el+ and its children.
|
180
|
+
def process(el, do_conversion = true, preserve_text = false, parent = nil)
  # Comments, processing instructions and doctypes only get a category, derived from the
  # tag name that corresponds to the parent ('div' if there is no parent).
  if [:xml_comment, :xml_pi, :html_doctype].include?(el.type)
    parent_tag =
      if parent.nil?
        'div'
      elsif parent.type == :html_element
        parent.value
      else
        {:code_span => 'code', :code_block => 'pre', :header => 'h1'}.fetch(parent.type) { parent.type.to_s }
      end
    category = HTML_PARSE_AS_SPAN.include?(parent_tag) ? :span : :block
    el.options = {:category => category}
    return
  end
  return unless el.type == :html_element

  # Remember the tag name now: the conversion below may reset el.value.
  tag = el.value
  remove_text_children(el) if REMOVE_TEXT_CHILDREN.include?(tag)

  converter = "convert_#{tag}"
  if !do_conversion
    process_html_element(el, do_conversion, preserve_text)
  elsif self.class.method_defined?(converter)
    send(converter, el)
  elsif SIMPLE_ELEMENTS.include?(tag)
    set_basics(el, tag.intern, HTML_SPAN_ELEMENTS.include?(tag) ? :span : :block)
    process_children(el, do_conversion, preserve_text)
  else
    process_html_element(el, do_conversion, preserve_text)
  end

  strip_whitespace(el) if STRIP_WHITESPACE.include?(tag)
  remove_whitespace_children(el) if REMOVE_WHITESPACE_CHILDREN.include?(tag)
  wrap_text_children(el) if WRAP_TEXT_CHILDREN.include?(tag)
end
|
217
|
+
|
218
|
+
# Process all children of +el+ in place: a text child is replaced by the elements returned
# from #process_text, every other child is handed to #process (with +el+ as parent) and kept.
# The nested arrays produced for text children are flattened into the children list.
def process_children(el, do_conversion = true, preserve_text = false)
  el.children.map! do |child|
    next process_text(child.value, preserve_text) if child.type == :text
    process(child, do_conversion, preserve_text, el)
    child
  end
  el.children.flatten!
end
|
228
|
+
|
229
|
+
# Process the HTML text +raw+: compress whitespace (if +preserve+ is +false+) and convert
|
230
|
+
# entities in entity elements.
|
231
|
+
def process_text(raw, preserve = false)
  # NOTE: +raw+ itself is modified when whitespace gets compressed.
  raw.gsub!(/\s+/, ' ') unless preserve

  scanner = StringScanner.new(raw)
  entity_ahead = /(?=#{HTML_ENTITY_RE})/
  quote_names = %w{lsquo rsquo ldquo rdquo}
  symbol_names = %w{mdash ndash hellip laquo raquo}
  elements = []

  until scanner.eos?
    text = scanner.scan_until(entity_ahead)
    if text.nil?
      # No entity left: the remainder is one text element.
      elements << Element.new(:text, scanner.scan(/.*/m))
      next
    end

    elements << Element.new(:text, text)
    scanner.scan(HTML_ENTITY_RE)
    # Group 1 is a named entity, group 2 a decimal and group 3 a hexadecimal code point.
    val = scanner[1] || (scanner[2] && scanner[2].to_i) || scanner[3].hex
    elements <<
      if quote_names.include?(val)
        Element.new(:smart_quote, val.intern)
      elsif symbol_names.include?(val)
        Element.new(:typographic_sym, val.intern)
      else
        Element.new(:entity, entity(val), nil, :original => scanner.matched)
      end
  end
  elements
end
|
253
|
+
|
254
|
+
# Keep +el+ as an HTML element: only its options are set (category from HTML_SPAN_ELEMENTS,
# parse type from HTML_PARSE_AS) and its children are processed.
def process_html_element(el, do_conversion = true, preserve_text = false)
  tag = el.value
  category = HTML_SPAN_ELEMENTS.include?(tag) ? :span : :block
  el.options = {:category => category, :parse_type => HTML_PARSE_AS[tag]}
  process_children(el, do_conversion, preserve_text)
end
|
260
|
+
|
261
|
+
# Delete, in place, every direct child of +el+ whose type is <tt>:text</tt>.
def remove_text_children(el)
  el.children.delete_if do |child|
    child.type == :text
  end
end
|
264
|
+
|
265
|
+
# Replace the children of +el+ so that every run of consecutive children that are text or
# not of category <tt>:block</tt> is collected in a new transparent <tt>:p</tt> element;
# block children are kept as they are.
def wrap_text_children(el)
  wrapped = []
  paragraph = nil # the paragraph currently being filled, if any
  el.children.each do |child|
    if child.options[:category] == :block && child.type != :text
      wrapped << child
      paragraph = nil
    else
      if paragraph.nil?
        paragraph = Element.new(:p, nil, nil, :transparent => true)
        wrapped << paragraph
      end
      paragraph.children << child
    end
  end
  el.children = wrapped
end
|
283
|
+
|
284
|
+
# Strip, in place, leading whitespace from the first child of +el+ and trailing whitespace
# from its last child - each only if that child is a text element.
def strip_whitespace(el)
  kids = el.children
  return if kids.empty?

  head = kids.first
  head.value.lstrip! if head.type == :text

  tail = kids.last
  tail.value.rstrip! if tail.type == :text
end
|
293
|
+
|
294
|
+
# Delete, in place, those children of +el+ that are whitespace-only text elements and are
# either the first child, the last child or located between two children of category
# <tt>:block</tt>.
#
# Position and neighbours are taken from a snapshot of the children made before anything is
# deleted, so the result does not depend on what the array looks like while +delete_if+ is
# modifying it.
def remove_whitespace_children(el)
  siblings = el.children.dup
  last_index = siblings.length - 1
  index = -1
  el.children.delete_if do |child|
    index += 1
    next false unless child.type == :text && child.value.strip.empty?
    next true if index == 0 || index == last_index

    siblings[index - 1].options[:category] == :block &&
      siblings[index + 1].options[:category] == :block
  end
end
|
303
|
+
|
304
|
+
# Turn +el+ into an element of the given +type+: its options are replaced by a hash holding
# the +category+ plus everything from +opts+ (entries in +opts+ win) and its value is reset
# to +nil+.
def set_basics(el, type, category, opts = {})
  options = {:category => category}
  options.update(opts)
  el.type = type
  el.options = options
  el.value = nil
end
|
309
|
+
|
310
|
+
# Append the values of all text elements found in +el+ and its descendants to the string
# +raw+, in document order (the element itself first, then its children recursively).
def extract_text(el, raw)
  if el.type == :text
    raw << el.value.to_s
  end
  el.children.each do |child|
    extract_text(child, raw)
  end
end
|
314
|
+
|
315
|
+
# Convert an <tt>a</tt> element: with an +href+ attribute it becomes a native <tt>:a</tt>
# span element, without one it stays an HTML element whose children are not converted.
def convert_a(el)
  return process_html_element(el, false) unless el.attr['href']

  set_basics(el, :a, :span)
  process_children(el)
end
|
323
|
+
|
324
|
+
# Convert a <tt>b</tt> element into a native <tt>:strong</tt> span element and process its
# children.
def convert_b(el)
  set_basics(el, :strong, :span)
  process_children(el)
end
|
328
|
+
|
329
|
+
# Convert an <tt>i</tt> element into a native <tt>:em</tt> span element and process its
# children.
def convert_i(el)
  set_basics(el, :em, :span)
  process_children(el)
end
|
333
|
+
|
334
|
+
# Convert a heading element into a native <tt>:header</tt> block element. The level is taken
# from the digit in the tag name and the text content is collected into the
# <tt>:raw_text</tt> option before the children are processed.
def convert_h1(el)
  # Must be read first: set_basics is handed el.value and may reset it.
  level = el.value[1..1].to_i
  set_basics(el, :header, :block, :level => level)

  raw_text = ''
  el.options[:raw_text] = raw_text
  extract_text(el, raw_text)

  process_children(el)
end
|
339
|
+
# The heading levels 2 to 6 are converted by the same code as h1.
(2..6).each { |level| alias_method(:"convert_h#{level}", :convert_h1) }
|
342
|
+
|
343
|
+
# Convert a <tt>code</tt> (or, via the alias, <tt>pre</tt>) element. The text content is
# extracted and run through #process_text with whitespace preserved; if everything found can
# be merged into one string, the element becomes a native <tt>:codespan</tt>
# (<tt>code</tt>) or <tt>:codeblock</tt> element holding that string. Otherwise it stays an
# HTML element whose children are not converted.
def convert_code(el)
  raw = ''
  extract_text(el, raw)
  result = process_text(raw, true)

  begin
    merged = result.inject('') do |mem, c|
      case c.type
      when :text
        mem << c.value
      when :entity
        if RUBY_VERSION >= '1.9'
          mem << c.value.char.encode(@doc.parse_infos[:encoding])
        elsif [60, 62, 34, 38].include?(c.value.code_point)
          mem << c.value.code_point.chr
        end
      when :smart_quote, :typographic_sym
        mem << entity(c.value.to_s).char.encode(@doc.parse_infos[:encoding])
      else
        raise "Bug - please report"
      end
    end
    result.clear
    result << Element.new(:text, merged)
  rescue StandardError
    # Merging failed: +result+ is left as it was, which selects the HTML element branch
    # below unless it consists of a single text element.
  end

  if result.length > 1 || result.first.type != :text
    process_html_element(el, false, true)
  else
    type, category = (el.value == 'code' ? [:codespan, :span] : [:codeblock, :block])
    set_basics(el, type, category)
    el.value = result.first.value
    el.children.clear
  end
end
alias :convert_pre :convert_code
|
380
|
+
|
381
|
+
# Convert a <tt>table</tt> element. A table that is not simple (see #is_simple_table?) stays
# an HTML element whose children are not converted. A simple table becomes a native
# <tt>:table</tt> block element: its children are processed, the <tt>:alignment</tt> option
# gets one <tt>:default</tt> entry per child of the first row found and, if the rows are
# direct children of the table, they are wrapped in a new <tt>:tbody</tt> element.
#
# A table without any children is handled too: it ends up with an empty alignment list.
def convert_table(el)
  unless is_simple_table?(el)
    process_html_element(el, false)
    return
  end

  process_children(el)
  set_basics(el, :table, :block)
  el.options[:alignment] = []

  # Depth-first search for the first :tr; as long as the alignment list is empty, a row
  # defines the number of columns.
  calc_alignment = lambda do |c|
    if c.type == :tr && el.options[:alignment].empty?
      el.options[:alignment] = [:default] * c.children.length
    else
      c.children.each { |cc| calc_alignment.call(cc) }
    end
  end
  calc_alignment.call(el)

  first_child = el.children.first
  if first_child && first_child.type == :tr
    tbody = Element.new(:tbody, nil, nil, :category => :block)
    tbody.children = el.children
    el.children = [tbody]
  end
end
|
404
|
+
|
405
|
+
# Return +true+ if the table element +el+ is simple. The visible checks are:
#
# - no <tt>th</tt>/<tt>td</tt> cell contains, at any depth, a non-text element listed in
#   HTML_BLOCK_ELEMENTS, and
# - the table either consists only of <tt>tr</tt> rows with <tt>td</tt> cells, or only of
#   <tt>thead</tt> (rows of <tt>th</tt>), <tt>tbody</tt> and <tt>tfoot</tt> (rows of
#   <tt>td</tt>) sections with at least one <tt>tbody</tt>.
#
# Text children are accepted everywhere in the second check.
def is_simple_table?(el)
  phrasing_only = lambda do |node|
    node.children.all? do |child|
      (child.type == :text || !HTML_BLOCK_ELEMENTS.include?(child.value)) && phrasing_only.call(child)
    end
  end

  # A Proc (not a lambda) on purpose: the +return+ leaves is_simple_table? itself as soon
  # as one offending cell is found.
  cell_check = Proc.new do |node|
    if %w{th td}.include?(node.value)
      return false unless phrasing_only.call(node)
    else
      node.children.each { |child| cell_check.call(child) }
    end
  end
  cell_check.call(el)

  rows_of = lambda do |section, cell_tag|
    section.children.all? do |row|
      (row.value == 'tr' || row.type == :text) &&
        row.children.all? { |cell| cell.value == cell_tag || cell.type == :text }
    end
  end
  return true if rows_of.call(el, 'td')

  sections_valid = el.children.all? do |section|
    section.type == :text ||
      (section.value == 'thead' && rows_of.call(section, 'th')) ||
      ((section.value == 'tfoot' || section.value == 'tbody') && rows_of.call(section, 'td'))
  end
  sections_valid && el.children.any? { |section| section.value == 'tbody' }
end
|
429
|
+
|
430
|
+
# Convert a <tt>div</tt> (or, via the alias, <tt>span</tt>) element: if it qualifies as a
# math tag (see #is_math_tag?) it is handled by #handle_math_tag, otherwise it is processed
# as an ordinary HTML element.
def convert_div(el)
  if is_math_tag?(el)
    handle_math_tag(el)
  else
    process_html_element(el)
  end
end
alias :convert_span :convert_div
|
438
|
+
|
439
|
+
# Truthy if the +class+ attribute of +el+ contains the word +math+ and +el+ has exactly one
# child which is a text element.
def is_math_tag?(el)
  css_class = el.attr['class'].to_s
  return nil unless css_class =~ /\bmath\b/

  kids = el.children
  kids.size == 1 && kids.first.type == :text
end
|
443
|
+
|
444
|
+
# Turn the math <tt>div</tt>/<tt>span</tt> element +el+ into a native <tt>:math</tt> element
# (block for <tt>div</tt>, span otherwise). The value of its first child becomes the value
# of the element, the +math+ class is removed from the +class+ attribute (the attribute is
# deleted if nothing else is in it) and the entities <tt>&amp;amp;</tt>,
# <tt>&amp;quot;</tt>, <tt>&amp;gt;</tt> and <tt>&amp;lt;</tt> in the value are replaced by
# the characters they stand for.
#
# Expects +el+ to have a +class+ attribute and a first child, as checked by #is_math_tag?.
def handle_math_tag(el)
  set_basics(el, :math, (el.value == 'div' ? :block : :span))
  el.value = el.children.shift.value

  if el.attr['class'] =~ /\A\s*math\s*\z/
    el.attr.delete('class')
  else
    # Only the stand-alone word is removed, so a class like 'mathy' is left alone.
    el.attr['class'] = el.attr['class'].sub(/\s?\bmath\b/, '')
  end

  replacements = {'&amp;' => '&', '&quot;' => '"', '&gt;' => '>', '&lt;' => '<'}
  el.value.gsub!(/&(amp|quot|gt|lt);/) { |m| replacements[m] }
end
|
461
|
+
end
|
462
|
+
|
463
|
+
include Parser
|
464
|
+
|
465
|
+
# Parse +source+ as HTML document and return the created +tree+.
|
466
|
+
def parse(source)
  @stack = []
  @tree = Element.new(:root)
  @src = StringScanner.new(adapt_source(source))

  # Leading processing instructions, doctypes and comments (each optionally preceded by
  # whitespace) become children of the root, in the order in which they appear.
  prolog = [
    [/\s*#{HTML_INSTRUCTION_RE}/, :xml_pi],
    [/\s*#{HTML_DOCTYPE_RE}/, :html_doctype],
    [/\s*#{HTML_COMMENT_RE}/, :xml_comment]
  ]
  loop do
    matched = prolog.any? do |pattern, type|
      text = @src.scan(pattern)
      @tree.children << Element.new(type, text.strip, nil, :category => :block) if text
      text
    end
    break unless matched
  end

  # Every start tag that is not already closed gets its content parsed recursively.
  tag_handler = lambda do |c, closed|
    parse_raw_html(c, &tag_handler) unless closed
  end
  parse_raw_html(@tree, &tag_handler)

  converter = ElementConverter.new(@doc)
  @tree.children.each { |child| converter.process(child) }
  converter.remove_whitespace_children(@tree)
  @tree
end
|
493
|
+
|
494
|
+
end
|
495
|
+
|
496
|
+
end
|
497
|
+
|
498
|
+
end
|
499
|
+
|