newstile 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (405) hide show
  1. data/AUTHORS +1 -0
  2. data/CONTRIBUTERS +6 -0
  3. data/COPYING +24 -0
  4. data/ChangeLog +5489 -0
  5. data/GPL +674 -0
  6. data/README +31 -0
  7. data/Rakefile +342 -0
  8. data/VERSION +1 -0
  9. data/benchmark/benchmark.rb +34 -0
  10. data/benchmark/generate_data.rb +112 -0
  11. data/benchmark/historic-jruby-1.4.0.dat +7 -0
  12. data/benchmark/historic-ruby-1.8.6.dat +7 -0
  13. data/benchmark/historic-ruby-1.8.7.dat +7 -0
  14. data/benchmark/historic-ruby-1.9.1p243.dat +7 -0
  15. data/benchmark/historic-ruby-1.9.2dev.dat +7 -0
  16. data/benchmark/mdbasics.text +306 -0
  17. data/benchmark/mdsyntax.text +888 -0
  18. data/benchmark/static-jruby-1.4.0.dat +7 -0
  19. data/benchmark/static-ruby-1.8.6.dat +7 -0
  20. data/benchmark/static-ruby-1.8.7.dat +7 -0
  21. data/benchmark/static-ruby-1.9.1p243.dat +7 -0
  22. data/benchmark/static-ruby-1.9.2dev.dat +7 -0
  23. data/benchmark/testing.sh +9 -0
  24. data/benchmark/timing.sh +10 -0
  25. data/bin/newstile +82 -0
  26. data/data/newstile/document.html +18 -0
  27. data/data/newstile/document.latex +43 -0
  28. data/doc/default.scss.css +519 -0
  29. data/doc/default.template +80 -0
  30. data/doc/documentation.page +72 -0
  31. data/doc/index.page +96 -0
  32. data/doc/installation.page +90 -0
  33. data/doc/links.markdown +6 -0
  34. data/doc/news.feed +10 -0
  35. data/doc/news.page +28 -0
  36. data/doc/quickref.page +564 -0
  37. data/doc/syntax.page +1615 -0
  38. data/doc/tests.page +51 -0
  39. data/doc/virtual +2 -0
  40. data/lib/newstile.rb +23 -0
  41. data/lib/newstile/compatibility.rb +34 -0
  42. data/lib/newstile/converter.rb +43 -0
  43. data/lib/newstile/converter/base.rb +111 -0
  44. data/lib/newstile/converter/html.rb +405 -0
  45. data/lib/newstile/converter/latex.rb +577 -0
  46. data/lib/newstile/converter/markdown.rb +426 -0
  47. data/lib/newstile/converter/newstile.rb +426 -0
  48. data/lib/newstile/document.rb +168 -0
  49. data/lib/newstile/error.rb +27 -0
  50. data/lib/newstile/options.rb +296 -0
  51. data/lib/newstile/parser.rb +39 -0
  52. data/lib/newstile/parser/base.rb +94 -0
  53. data/lib/newstile/parser/html.rb +499 -0
  54. data/lib/newstile/parser/newstile.rb +325 -0
  55. data/lib/newstile/parser/newstile/abbreviation.rb +66 -0
  56. data/lib/newstile/parser/newstile/attribute_list.rb +111 -0
  57. data/lib/newstile/parser/newstile/autolink.rb +54 -0
  58. data/lib/newstile/parser/newstile/blank_line.rb +43 -0
  59. data/lib/newstile/parser/newstile/block_boundary.rb +46 -0
  60. data/lib/newstile/parser/newstile/blockquote.rb +63 -0
  61. data/lib/newstile/parser/newstile/codeblock.rb +60 -0
  62. data/lib/newstile/parser/newstile/codespan.rb +57 -0
  63. data/lib/newstile/parser/newstile/emphasis.rb +70 -0
  64. data/lib/newstile/parser/newstile/eob.rb +39 -0
  65. data/lib/newstile/parser/newstile/escaped_chars.rb +38 -0
  66. data/lib/newstile/parser/newstile/extension.rb +116 -0
  67. data/lib/newstile/parser/newstile/footnote.rb +74 -0
  68. data/lib/newstile/parser/newstile/header.rb +84 -0
  69. data/lib/newstile/parser/newstile/horizontal_rule.rb +39 -0
  70. data/lib/newstile/parser/newstile/html.rb +175 -0
  71. data/lib/newstile/parser/newstile/html_entity.rb +39 -0
  72. data/lib/newstile/parser/newstile/line_break.rb +38 -0
  73. data/lib/newstile/parser/newstile/link.rb +177 -0
  74. data/lib/newstile/parser/newstile/list.rb +239 -0
  75. data/lib/newstile/parser/newstile/math.rb +64 -0
  76. data/lib/newstile/parser/newstile/paragraph.rb +55 -0
  77. data/lib/newstile/parser/newstile/smart_quotes.rb +214 -0
  78. data/lib/newstile/parser/newstile/table.rb +134 -0
  79. data/lib/newstile/parser/newstile/typographic_symbol.rb +54 -0
  80. data/lib/newstile/utils.rb +37 -0
  81. data/lib/newstile/utils/entities.rb +336 -0
  82. data/lib/newstile/utils/html.rb +75 -0
  83. data/lib/newstile/utils/ordered_hash.rb +79 -0
  84. data/lib/newstile/version.rb +28 -0
  85. data/man/man1/newstile.1 +246 -0
  86. data/setup.rb +1585 -0
  87. data/test/run_tests.rb +59 -0
  88. data/test/test_files.rb +162 -0
  89. data/test/testcases/block/01_blank_line/spaces.html +1 -0
  90. data/test/testcases/block/01_blank_line/spaces.text +3 -0
  91. data/test/testcases/block/01_blank_line/tabs.html +1 -0
  92. data/test/testcases/block/01_blank_line/tabs.text +6 -0
  93. data/test/testcases/block/02_eob/beginning.html +1 -0
  94. data/test/testcases/block/02_eob/beginning.text +3 -0
  95. data/test/testcases/block/02_eob/end.html +1 -0
  96. data/test/testcases/block/02_eob/end.text +3 -0
  97. data/test/testcases/block/02_eob/middle.html +1 -0
  98. data/test/testcases/block/02_eob/middle.text +5 -0
  99. data/test/testcases/block/03_paragraph/indented.html +18 -0
  100. data/test/testcases/block/03_paragraph/indented.text +19 -0
  101. data/test/testcases/block/03_paragraph/no_newline_at_end.html +5 -0
  102. data/test/testcases/block/03_paragraph/no_newline_at_end.text +5 -0
  103. data/test/testcases/block/03_paragraph/one_para.html +1 -0
  104. data/test/testcases/block/03_paragraph/one_para.text +1 -0
  105. data/test/testcases/block/03_paragraph/two_para.html +4 -0
  106. data/test/testcases/block/03_paragraph/two_para.text +4 -0
  107. data/test/testcases/block/04_header/atx_header.html +37 -0
  108. data/test/testcases/block/04_header/atx_header.text +34 -0
  109. data/test/testcases/block/04_header/atx_header_no_newline_at_end.html +1 -0
  110. data/test/testcases/block/04_header/atx_header_no_newline_at_end.text +1 -0
  111. data/test/testcases/block/04_header/setext_header.html +30 -0
  112. data/test/testcases/block/04_header/setext_header.html.19 +30 -0
  113. data/test/testcases/block/04_header/setext_header.text +36 -0
  114. data/test/testcases/block/04_header/setext_header_no_newline_at_end.html +1 -0
  115. data/test/testcases/block/04_header/setext_header_no_newline_at_end.text +2 -0
  116. data/test/testcases/block/04_header/with_auto_id_prefix.html +3 -0
  117. data/test/testcases/block/04_header/with_auto_id_prefix.options +2 -0
  118. data/test/testcases/block/04_header/with_auto_id_prefix.text +3 -0
  119. data/test/testcases/block/04_header/with_auto_ids.html +17 -0
  120. data/test/testcases/block/04_header/with_auto_ids.options +1 -0
  121. data/test/testcases/block/04_header/with_auto_ids.text +19 -0
  122. data/test/testcases/block/05_blockquote/indented.html +25 -0
  123. data/test/testcases/block/05_blockquote/indented.text +14 -0
  124. data/test/testcases/block/05_blockquote/lazy.html +34 -0
  125. data/test/testcases/block/05_blockquote/lazy.text +20 -0
  126. data/test/testcases/block/05_blockquote/nested.html +10 -0
  127. data/test/testcases/block/05_blockquote/nested.text +6 -0
  128. data/test/testcases/block/05_blockquote/no_newline_at_end.html +4 -0
  129. data/test/testcases/block/05_blockquote/no_newline_at_end.text +2 -0
  130. data/test/testcases/block/05_blockquote/with_code_blocks.html +15 -0
  131. data/test/testcases/block/05_blockquote/with_code_blocks.text +11 -0
  132. data/test/testcases/block/06_codeblock/error.html +4 -0
  133. data/test/testcases/block/06_codeblock/error.text +4 -0
  134. data/test/testcases/block/06_codeblock/lazy.html +4 -0
  135. data/test/testcases/block/06_codeblock/lazy.text +5 -0
  136. data/test/testcases/block/06_codeblock/no_newline_at_end.html +2 -0
  137. data/test/testcases/block/06_codeblock/no_newline_at_end.text +1 -0
  138. data/test/testcases/block/06_codeblock/no_newline_at_end_1.html +2 -0
  139. data/test/testcases/block/06_codeblock/no_newline_at_end_1.text +2 -0
  140. data/test/testcases/block/06_codeblock/normal.html +13 -0
  141. data/test/testcases/block/06_codeblock/normal.text +10 -0
  142. data/test/testcases/block/06_codeblock/tilde_syntax.html +7 -0
  143. data/test/testcases/block/06_codeblock/tilde_syntax.text +9 -0
  144. data/test/testcases/block/06_codeblock/whitespace.html +3 -0
  145. data/test/testcases/block/06_codeblock/whitespace.text +3 -0
  146. data/test/testcases/block/06_codeblock/with_blank_line.html +13 -0
  147. data/test/testcases/block/06_codeblock/with_blank_line.text +12 -0
  148. data/test/testcases/block/06_codeblock/with_eob_marker.html +6 -0
  149. data/test/testcases/block/06_codeblock/with_eob_marker.text +5 -0
  150. data/test/testcases/block/06_codeblock/with_ial.html +6 -0
  151. data/test/testcases/block/06_codeblock/with_ial.text +5 -0
  152. data/test/testcases/block/07_horizontal_rule/error.html +7 -0
  153. data/test/testcases/block/07_horizontal_rule/error.html.19 +7 -0
  154. data/test/testcases/block/07_horizontal_rule/error.text +7 -0
  155. data/test/testcases/block/07_horizontal_rule/normal.html +17 -0
  156. data/test/testcases/block/07_horizontal_rule/normal.text +17 -0
  157. data/test/testcases/block/07_horizontal_rule/sepspaces.html +3 -0
  158. data/test/testcases/block/07_horizontal_rule/sepspaces.text +3 -0
  159. data/test/testcases/block/07_horizontal_rule/septabs.html +3 -0
  160. data/test/testcases/block/07_horizontal_rule/septabs.text +3 -0
  161. data/test/testcases/block/08_list/escaping.html +17 -0
  162. data/test/testcases/block/08_list/escaping.text +17 -0
  163. data/test/testcases/block/08_list/item_ial.html +7 -0
  164. data/test/testcases/block/08_list/item_ial.text +5 -0
  165. data/test/testcases/block/08_list/lazy.html +39 -0
  166. data/test/testcases/block/08_list/lazy.text +29 -0
  167. data/test/testcases/block/08_list/list_and_hr.html +9 -0
  168. data/test/testcases/block/08_list/list_and_hr.text +5 -0
  169. data/test/testcases/block/08_list/list_and_others.html +40 -0
  170. data/test/testcases/block/08_list/list_and_others.text +26 -0
  171. data/test/testcases/block/08_list/mixed.html +117 -0
  172. data/test/testcases/block/08_list/mixed.text +66 -0
  173. data/test/testcases/block/08_list/nested.html +17 -0
  174. data/test/testcases/block/08_list/nested.text +7 -0
  175. data/test/testcases/block/08_list/other_first_element.html +39 -0
  176. data/test/testcases/block/08_list/other_first_element.text +18 -0
  177. data/test/testcases/block/08_list/simple_ol.html +19 -0
  178. data/test/testcases/block/08_list/simple_ol.text +13 -0
  179. data/test/testcases/block/08_list/simple_ul.html +48 -0
  180. data/test/testcases/block/08_list/simple_ul.text +36 -0
  181. data/test/testcases/block/08_list/single_item.html +3 -0
  182. data/test/testcases/block/08_list/single_item.text +1 -0
  183. data/test/testcases/block/08_list/special_cases.html +55 -0
  184. data/test/testcases/block/08_list/special_cases.text +35 -0
  185. data/test/testcases/block/09_html/comment.html +18 -0
  186. data/test/testcases/block/09_html/comment.text +15 -0
  187. data/test/testcases/block/09_html/content_model/deflists.html +6 -0
  188. data/test/testcases/block/09_html/content_model/deflists.options +1 -0
  189. data/test/testcases/block/09_html/content_model/deflists.text +6 -0
  190. data/test/testcases/block/09_html/content_model/tables.html +14 -0
  191. data/test/testcases/block/09_html/content_model/tables.options +1 -0
  192. data/test/testcases/block/09_html/content_model/tables.text +14 -0
  193. data/test/testcases/block/09_html/html_and_codeblocks.html +15 -0
  194. data/test/testcases/block/09_html/html_and_codeblocks.options +1 -0
  195. data/test/testcases/block/09_html/html_and_codeblocks.text +13 -0
  196. data/test/testcases/block/09_html/html_to_native/code.html +10 -0
  197. data/test/testcases/block/09_html/html_to_native/code.text +9 -0
  198. data/test/testcases/block/09_html/html_to_native/comment.html +7 -0
  199. data/test/testcases/block/09_html/html_to_native/comment.text +8 -0
  200. data/test/testcases/block/09_html/html_to_native/emphasis.html +3 -0
  201. data/test/testcases/block/09_html/html_to_native/emphasis.text +3 -0
  202. data/test/testcases/block/09_html/html_to_native/entity.html +1 -0
  203. data/test/testcases/block/09_html/html_to_native/entity.text +1 -0
  204. data/test/testcases/block/09_html/html_to_native/header.html +6 -0
  205. data/test/testcases/block/09_html/html_to_native/header.options +2 -0
  206. data/test/testcases/block/09_html/html_to_native/header.text +6 -0
  207. data/test/testcases/block/09_html/html_to_native/list_dl.html +8 -0
  208. data/test/testcases/block/09_html/html_to_native/list_dl.text +8 -0
  209. data/test/testcases/block/09_html/html_to_native/list_ol.html +15 -0
  210. data/test/testcases/block/09_html/html_to_native/list_ol.text +17 -0
  211. data/test/testcases/block/09_html/html_to_native/list_ul.html +19 -0
  212. data/test/testcases/block/09_html/html_to_native/list_ul.text +22 -0
  213. data/test/testcases/block/09_html/html_to_native/options +1 -0
  214. data/test/testcases/block/09_html/html_to_native/paragraph.html +3 -0
  215. data/test/testcases/block/09_html/html_to_native/paragraph.text +4 -0
  216. data/test/testcases/block/09_html/html_to_native/table_normal.html +14 -0
  217. data/test/testcases/block/09_html/html_to_native/table_normal.text +12 -0
  218. data/test/testcases/block/09_html/html_to_native/table_simple.html +48 -0
  219. data/test/testcases/block/09_html/html_to_native/table_simple.text +56 -0
  220. data/test/testcases/block/09_html/html_to_native/typography.html +1 -0
  221. data/test/testcases/block/09_html/html_to_native/typography.html.19 +1 -0
  222. data/test/testcases/block/09_html/html_to_native/typography.text +1 -0
  223. data/test/testcases/block/09_html/invalid_html_1.html +5 -0
  224. data/test/testcases/block/09_html/invalid_html_1.text +5 -0
  225. data/test/testcases/block/09_html/invalid_html_2.html +5 -0
  226. data/test/testcases/block/09_html/invalid_html_2.text +5 -0
  227. data/test/testcases/block/09_html/markdown_attr.html +38 -0
  228. data/test/testcases/block/09_html/markdown_attr.text +38 -0
  229. data/test/testcases/block/09_html/not_parsed.html +24 -0
  230. data/test/testcases/block/09_html/not_parsed.text +24 -0
  231. data/test/testcases/block/09_html/parse_as_raw.html +30 -0
  232. data/test/testcases/block/09_html/parse_as_raw.options +1 -0
  233. data/test/testcases/block/09_html/parse_as_raw.text +29 -0
  234. data/test/testcases/block/09_html/parse_as_span.html +12 -0
  235. data/test/testcases/block/09_html/parse_as_span.options +1 -0
  236. data/test/testcases/block/09_html/parse_as_span.text +9 -0
  237. data/test/testcases/block/09_html/parse_block_html.html +21 -0
  238. data/test/testcases/block/09_html/parse_block_html.options +1 -0
  239. data/test/testcases/block/09_html/parse_block_html.text +17 -0
  240. data/test/testcases/block/09_html/processing_instruction.html +13 -0
  241. data/test/testcases/block/09_html/processing_instruction.text +12 -0
  242. data/test/testcases/block/09_html/simple.html +64 -0
  243. data/test/testcases/block/09_html/simple.html.19 +64 -0
  244. data/test/testcases/block/09_html/simple.options +1 -0
  245. data/test/testcases/block/09_html/simple.text +59 -0
  246. data/test/testcases/block/10_ald/simple.html +2 -0
  247. data/test/testcases/block/10_ald/simple.text +8 -0
  248. data/test/testcases/block/11_ial/auto_id_and_ial.html +1 -0
  249. data/test/testcases/block/11_ial/auto_id_and_ial.options +1 -0
  250. data/test/testcases/block/11_ial/auto_id_and_ial.text +2 -0
  251. data/test/testcases/block/11_ial/simple.html +25 -0
  252. data/test/testcases/block/11_ial/simple.text +34 -0
  253. data/test/testcases/block/12_extension/comment.html +8 -0
  254. data/test/testcases/block/12_extension/comment.text +12 -0
  255. data/test/testcases/block/12_extension/ignored.html +8 -0
  256. data/test/testcases/block/12_extension/ignored.text +8 -0
  257. data/test/testcases/block/12_extension/nomarkdown.html +10 -0
  258. data/test/testcases/block/12_extension/nomarkdown.kramdown +20 -0
  259. data/test/testcases/block/12_extension/nomarkdown.latex +13 -0
  260. data/test/testcases/block/12_extension/nomarkdown.text +21 -0
  261. data/test/testcases/block/12_extension/options.html +21 -0
  262. data/test/testcases/block/12_extension/options.text +21 -0
  263. data/test/testcases/block/12_extension/options2.html +10 -0
  264. data/test/testcases/block/12_extension/options2.text +5 -0
  265. data/test/testcases/block/12_extension/options3.html +7 -0
  266. data/test/testcases/block/12_extension/options3.text +7 -0
  267. data/test/testcases/block/13_definition_list/definition_at_beginning.html +1 -0
  268. data/test/testcases/block/13_definition_list/definition_at_beginning.text +1 -0
  269. data/test/testcases/block/13_definition_list/item_ial.html +12 -0
  270. data/test/testcases/block/13_definition_list/item_ial.text +8 -0
  271. data/test/testcases/block/13_definition_list/multiple_terms.html +13 -0
  272. data/test/testcases/block/13_definition_list/multiple_terms.text +10 -0
  273. data/test/testcases/block/13_definition_list/no_def_list.html +2 -0
  274. data/test/testcases/block/13_definition_list/no_def_list.text +2 -0
  275. data/test/testcases/block/13_definition_list/para_wrapping.html +10 -0
  276. data/test/testcases/block/13_definition_list/para_wrapping.text +6 -0
  277. data/test/testcases/block/13_definition_list/separated_by_eob.html +8 -0
  278. data/test/testcases/block/13_definition_list/separated_by_eob.text +5 -0
  279. data/test/testcases/block/13_definition_list/simple.html +8 -0
  280. data/test/testcases/block/13_definition_list/simple.text +7 -0
  281. data/test/testcases/block/13_definition_list/styled_terms.html +4 -0
  282. data/test/testcases/block/13_definition_list/styled_terms.text +2 -0
  283. data/test/testcases/block/13_definition_list/too_much_space.html +3 -0
  284. data/test/testcases/block/13_definition_list/too_much_space.text +4 -0
  285. data/test/testcases/block/13_definition_list/with_blocks.html +38 -0
  286. data/test/testcases/block/13_definition_list/with_blocks.text +24 -0
  287. data/test/testcases/block/14_table/errors.html +8 -0
  288. data/test/testcases/block/14_table/errors.text +9 -0
  289. data/test/testcases/block/14_table/footer.html +65 -0
  290. data/test/testcases/block/14_table/footer.text +25 -0
  291. data/test/testcases/block/14_table/header.html +103 -0
  292. data/test/testcases/block/14_table/header.text +32 -0
  293. data/test/testcases/block/14_table/no_table.html +3 -0
  294. data/test/testcases/block/14_table/no_table.text +3 -0
  295. data/test/testcases/block/14_table/simple.html +139 -0
  296. data/test/testcases/block/14_table/simple.text +38 -0
  297. data/test/testcases/block/15_math/normal.html +26 -0
  298. data/test/testcases/block/15_math/normal.text +28 -0
  299. data/test/testcases/block/16_toc/no_toc_depth.html +33 -0
  300. data/test/testcases/block/16_toc/no_toc_depth.options +1 -0
  301. data/test/testcases/block/16_toc/no_toc_depth.text +16 -0
  302. data/test/testcases/block/16_toc/toc_depth_2.html +24 -0
  303. data/test/testcases/block/16_toc/toc_depth_2.options +1 -0
  304. data/test/testcases/block/16_toc/toc_depth_2.text +16 -0
  305. data/test/testcases/encoding.html +46 -0
  306. data/test/testcases/encoding.text +28 -0
  307. data/test/testcases/span/01_link/empty.html +5 -0
  308. data/test/testcases/span/01_link/empty.text +5 -0
  309. data/test/testcases/span/01_link/image_in_a.html +5 -0
  310. data/test/testcases/span/01_link/image_in_a.text +5 -0
  311. data/test/testcases/span/01_link/imagelinks.html +14 -0
  312. data/test/testcases/span/01_link/imagelinks.text +16 -0
  313. data/test/testcases/span/01_link/inline.html +46 -0
  314. data/test/testcases/span/01_link/inline.html.19 +46 -0
  315. data/test/testcases/span/01_link/inline.text +48 -0
  316. data/test/testcases/span/01_link/link_defs.html +9 -0
  317. data/test/testcases/span/01_link/link_defs.text +26 -0
  318. data/test/testcases/span/01_link/links_with_angle_brackets.html +3 -0
  319. data/test/testcases/span/01_link/links_with_angle_brackets.text +3 -0
  320. data/test/testcases/span/01_link/reference.html +35 -0
  321. data/test/testcases/span/01_link/reference.html.19 +35 -0
  322. data/test/testcases/span/01_link/reference.text +47 -0
  323. data/test/testcases/span/02_emphasis/empty.html +3 -0
  324. data/test/testcases/span/02_emphasis/empty.text +3 -0
  325. data/test/testcases/span/02_emphasis/errors.html +9 -0
  326. data/test/testcases/span/02_emphasis/errors.text +9 -0
  327. data/test/testcases/span/02_emphasis/nesting.html +38 -0
  328. data/test/testcases/span/02_emphasis/nesting.text +33 -0
  329. data/test/testcases/span/02_emphasis/normal.html +46 -0
  330. data/test/testcases/span/02_emphasis/normal.text +46 -0
  331. data/test/testcases/span/03_codespan/empty.html +5 -0
  332. data/test/testcases/span/03_codespan/empty.text +5 -0
  333. data/test/testcases/span/03_codespan/errors.html +1 -0
  334. data/test/testcases/span/03_codespan/errors.text +1 -0
  335. data/test/testcases/span/03_codespan/highlighting.html +1 -0
  336. data/test/testcases/span/03_codespan/highlighting.text +1 -0
  337. data/test/testcases/span/03_codespan/normal.html +16 -0
  338. data/test/testcases/span/03_codespan/normal.text +16 -0
  339. data/test/testcases/span/04_footnote/definitions.html +17 -0
  340. data/test/testcases/span/04_footnote/definitions.latex +17 -0
  341. data/test/testcases/span/04_footnote/definitions.text +24 -0
  342. data/test/testcases/span/04_footnote/footnote_nr.html +12 -0
  343. data/test/testcases/span/04_footnote/footnote_nr.latex +2 -0
  344. data/test/testcases/span/04_footnote/footnote_nr.options +1 -0
  345. data/test/testcases/span/04_footnote/footnote_nr.text +4 -0
  346. data/test/testcases/span/04_footnote/markers.html +46 -0
  347. data/test/testcases/span/04_footnote/markers.latex +23 -0
  348. data/test/testcases/span/04_footnote/markers.text +26 -0
  349. data/test/testcases/span/05_html/across_lines.html +1 -0
  350. data/test/testcases/span/05_html/across_lines.text +2 -0
  351. data/test/testcases/span/05_html/invalid.html +1 -0
  352. data/test/testcases/span/05_html/invalid.text +1 -0
  353. data/test/testcases/span/05_html/link_with_mailto.html +1 -0
  354. data/test/testcases/span/05_html/link_with_mailto.text +1 -0
  355. data/test/testcases/span/05_html/markdown_attr.html +6 -0
  356. data/test/testcases/span/05_html/markdown_attr.text +6 -0
  357. data/test/testcases/span/05_html/normal.html +30 -0
  358. data/test/testcases/span/05_html/normal.text +30 -0
  359. data/test/testcases/span/abbreviations/abbrev.html +8 -0
  360. data/test/testcases/span/abbreviations/abbrev.text +15 -0
  361. data/test/testcases/span/abbreviations/abbrev_defs.html +2 -0
  362. data/test/testcases/span/abbreviations/abbrev_defs.text +5 -0
  363. data/test/testcases/span/autolinks/url_links.html +12 -0
  364. data/test/testcases/span/autolinks/url_links.text +12 -0
  365. data/test/testcases/span/escaped_chars/normal.html +43 -0
  366. data/test/testcases/span/escaped_chars/normal.text +43 -0
  367. data/test/testcases/span/extension/comment.html +6 -0
  368. data/test/testcases/span/extension/comment.text +6 -0
  369. data/test/testcases/span/extension/ignored.html +1 -0
  370. data/test/testcases/span/extension/ignored.text +1 -0
  371. data/test/testcases/span/extension/nomarkdown.html +1 -0
  372. data/test/testcases/span/extension/nomarkdown.text +1 -0
  373. data/test/testcases/span/extension/options.html +1 -0
  374. data/test/testcases/span/extension/options.text +1 -0
  375. data/test/testcases/span/ial/simple.html +6 -0
  376. data/test/testcases/span/ial/simple.text +6 -0
  377. data/test/testcases/span/line_breaks/normal.html +11 -0
  378. data/test/testcases/span/line_breaks/normal.latex +12 -0
  379. data/test/testcases/span/line_breaks/normal.text +11 -0
  380. data/test/testcases/span/math/normal.html +5 -0
  381. data/test/testcases/span/math/normal.text +5 -0
  382. data/test/testcases/span/text_substitutions/entities.html +4 -0
  383. data/test/testcases/span/text_substitutions/entities.options +1 -0
  384. data/test/testcases/span/text_substitutions/entities.text +4 -0
  385. data/test/testcases/span/text_substitutions/entities_as_char.html +1 -0
  386. data/test/testcases/span/text_substitutions/entities_as_char.html.19 +1 -0
  387. data/test/testcases/span/text_substitutions/entities_as_char.options +1 -0
  388. data/test/testcases/span/text_substitutions/entities_as_char.text +1 -0
  389. data/test/testcases/span/text_substitutions/entities_as_input.html +1 -0
  390. data/test/testcases/span/text_substitutions/entities_as_input.options +1 -0
  391. data/test/testcases/span/text_substitutions/entities_as_input.text +1 -0
  392. data/test/testcases/span/text_substitutions/entities_numeric.html +1 -0
  393. data/test/testcases/span/text_substitutions/entities_numeric.options +1 -0
  394. data/test/testcases/span/text_substitutions/entities_numeric.text +1 -0
  395. data/test/testcases/span/text_substitutions/entities_symbolic.html +1 -0
  396. data/test/testcases/span/text_substitutions/entities_symbolic.options +1 -0
  397. data/test/testcases/span/text_substitutions/entities_symbolic.text +1 -0
  398. data/test/testcases/span/text_substitutions/greaterthan.html +1 -0
  399. data/test/testcases/span/text_substitutions/greaterthan.text +1 -0
  400. data/test/testcases/span/text_substitutions/lowerthan.html +1 -0
  401. data/test/testcases/span/text_substitutions/lowerthan.text +1 -0
  402. data/test/testcases/span/text_substitutions/typography.html +18 -0
  403. data/test/testcases/span/text_substitutions/typography.html.19 +18 -0
  404. data/test/testcases/span/text_substitutions/typography.text +18 -0
  405. metadata +476 -0
@@ -0,0 +1,39 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ #--
4
+ # Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
5
+ #
6
+ # This file is part of newstile.
7
+ #
8
+ # newstile is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+ #
22
+
23
+ module Newstile
24
+
25
+ # == Parser Module
26
+ #
27
+ # This module contains all available parsers. Currently, there are two parsers:
28
+ #
29
+ # * Newstile for parsing documents in newstile format
30
+ # * Html for parsing HTML documents
31
+ module Parser
32
+
33
+ autoload :Base, 'newstile/parser/base'
34
+ autoload :Newstile, 'newstile/parser/newstile'
35
+ autoload :Html, 'newstile/parser/html'
36
+
37
+ end
38
+
39
+ end
@@ -0,0 +1,94 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ #--
4
+ # Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
5
+ #
6
+ # This file is part of newstile.
7
+ #
8
+ # newstile is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+ #
22
+
23
+ module Newstile
24
+
25
+ module Parser
26
+
27
+ # == Base class for parsers
28
+ #
29
+ # This class serves as the base class for parsers. It provides common methods that can/should be
30
+ # used by all parsers, especially by those using StringScanner for parsing.
31
+ #
32
+ class Base
33
+
34
+ # Initialize the parser with the given Newstile document +doc+.
35
+ def initialize(doc)
36
+ @doc = doc
37
+ @text_type = :text
38
+ end
39
+ private_class_method(:new, :allocate)
40
+
41
+ # Parse the +source+ string into an element tree, using the information provided by the
42
+ # Newstile document +doc+.
43
+ #
44
+ # Initializes a new instance of the calling class and then calls the #parse method that must
45
+ # be implemented by each subclass.
46
+ def self.parse(source, doc)
47
+ new(doc).parse(source)
48
+ end
49
+
50
+
51
+ # Add the given warning +text+ to the warning array of the Newstile document.
52
+ def warning(text)
53
+ @doc.warnings << text
54
+ #TODO: add position information
55
+ end
56
+
57
+ # Modify the string +source+ to be usable by the parser.
58
+ def adapt_source(source)
59
+ source.gsub(/\r\n?/, "\n").chomp + "\n"
60
+ end
61
+
62
+ # This helper method adds the given +text+ either to the last element in the +tree+ if it is a
63
+ # +type+ element or creates a new text element with the given +type+.
64
+ def add_text(text, tree = @tree, type = @text_type)
65
+ if tree.children.last && tree.children.last.type == type
66
+ tree.children.last.value << text
67
+ elsif !text.empty?
68
+ tree.children << Element.new(type, text)
69
+ end
70
+ end
71
+
72
+ # Extract the part of the StringScanner +strscan+ backed string specified by the +range+. This
73
+ # method also works correctly under Ruby 1.9.
74
+ def extract_string(range, strscan)
75
+ result = nil
76
+ if RUBY_VERSION >= '1.9'
77
+ begin
78
+ enc = strscan.string.encoding
79
+ strscan.string.force_encoding('ASCII-8BIT')
80
+ result = strscan.string[range].force_encoding(enc)
81
+ ensure
82
+ strscan.string.force_encoding(enc)
83
+ end
84
+ else
85
+ result = strscan.string[range]
86
+ end
87
+ result
88
+ end
89
+
90
+ end
91
+
92
+ end
93
+
94
+ end
@@ -0,0 +1,499 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ #--
4
+ # Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
5
+ #
6
+ # This file is part of newstile.
7
+ #
8
+ # newstile is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+ #
22
+
23
+ require 'rexml/parsers/baseparser'
24
+ require 'strscan'
25
+
26
+ module Newstile
27
+
28
+ module Parser
29
+
30
+ # Used for parsing a HTML document.
31
+ class Html < Base
32
+
33
+ # Contains all constants that are used when parsing.
34
+ module Constants
35
+ #:stopdoc:
36
+ # The following regexps are based on the ones used by REXML, with some slight modifications.
37
+ HTML_DOCTYPE_RE = /<!DOCTYPE.*?>/m
38
+ HTML_COMMENT_RE = /<!--(.*?)-->/m
39
+ HTML_INSTRUCTION_RE = /<\?(.*?)\?>/m
40
+ HTML_ATTRIBUTE_RE = /\s*(#{REXML::Parsers::BaseParser::UNAME_STR})\s*=\s*(["'])(.*?)\2/m
41
+ HTML_TAG_RE = /<((?>#{REXML::Parsers::BaseParser::UNAME_STR}))\s*((?>\s+#{REXML::Parsers::BaseParser::UNAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/m
42
+ HTML_TAG_CLOSE_RE = /<\/(#{REXML::Parsers::BaseParser::UNAME_STR})\s*>/m
43
+ HTML_ENTITY_RE = /&([\w:][\-\w\.:]*);|&#(\d+);|&\#x([0-9a-fA-F]+);/
44
+
45
+
46
+ HTML_PARSE_AS_BLOCK = %w{applet button blockquote body colgroup dd div dl fieldset form iframe li
47
+ map noscript object ol table tbody thead tfoot tr td ul}
48
+ HTML_PARSE_AS_SPAN = %w{a abbr acronym address b bdo big cite caption del dfn dt em
49
+ h1 h2 h3 h4 h5 h6 i ins kbd label legend optgroup p q rb rbc
50
+ rp rt rtc ruby samp select small span strong sub sup th tt var}
51
+ HTML_PARSE_AS_RAW = %w{script math option textarea pre code}
52
+
53
+ HTML_PARSE_AS = Hash.new {|h,k| h[k] = :raw}
54
+ HTML_PARSE_AS_BLOCK.each {|i| HTML_PARSE_AS[i] = :block}
55
+ HTML_PARSE_AS_SPAN.each {|i| HTML_PARSE_AS[i] = :span}
56
+ HTML_PARSE_AS_RAW.each {|i| HTML_PARSE_AS[i] = :raw}
57
+
58
+ # Some HTML elements like script belong to both categories (i.e. are valid in block and
59
+ # span HTML) and don't appear therefore!
60
+ HTML_SPAN_ELEMENTS = %w{a abbr acronym b big bdo br button cite code del dfn em i img input
61
+ ins kbd label option q rb rbc rp rt rtc ruby samp select small span
62
+ strong sub sup textarea tt var}
63
+ HTML_BLOCK_ELEMENTS = %w{address article aside applet body button blockquote caption col colgroup dd div dl dt fieldset
64
+ figcaption footer form h1 h2 h3 h4 h5 h6 header hgroup hr html head iframe legend listing menu
65
+ li map nav ol optgroup p pre section summary table tbody td th thead tfoot tr ul}
66
+ HTML_ELEMENTS_WITHOUT_BODY = %w{area base br col command embed hr img input keygen link meta param source track wbr}
67
+ end
68
+
69
+
70
+ # Contains the parsing methods. This module can be mixed into any parser to get HTML parsing
71
+ # functionality. The only thing that must be provided by the class are instance variable
72
+ # <tt>@stack</tt> for storing needed state and <tt>@src</tt> (instance of StringScanner) for
73
+ # the actual parsing.
74
+ module Parser
75
+
76
+ include Constants
77
+
78
+ # Process the HTML start tag that has already be scanned/checked. Does the common processing
79
+ # steps and then yields to the caller for further processing.
80
+ def handle_html_start_tag
81
+ name = @src[1]
82
+ closed = !@src[4].nil?
83
+ attrs = Utils::OrderedHash.new
84
+ @src[2].scan(HTML_ATTRIBUTE_RE).each {|attr,sep,val| attrs[attr] = val}
85
+
86
+ el = Element.new(:html_element, name, attrs, :category => :block)
87
+ @tree.children << el
88
+
89
+ if !closed && HTML_ELEMENTS_WITHOUT_BODY.include?(el.value)
90
+ warning("The HTML tag '#{el.value}' cannot have any content - auto-closing it")
91
+ closed = true
92
+ end
93
+ if name == 'script'
94
+ handle_html_script_tag
95
+ yield(el, true)
96
+ else
97
+ yield(el, closed)
98
+ end
99
+ end
100
+
101
+ def handle_html_script_tag
102
+ curpos = @src.pos
103
+ if result = @src.scan_until(/(?=<\/script\s*>)/m)
104
+ add_text(extract_string(curpos...@src.pos, @src), @tree.children.last, :raw)
105
+ @src.scan(HTML_TAG_CLOSE_RE)
106
+ else
107
+ add_text(@src.scan(/.*/m), @tree.children.last, :raw)
108
+ warning("Found no end tag for 'script' - auto-closing it")
109
+ end
110
+ end
111
+
112
+ HTML_RAW_START = /(?=<(#{REXML::Parsers::BaseParser::UNAME_STR}|\/|!--|\?))/
113
+
114
+ # Parse raw HTML from the current source position, storing the found elements in +el+.
115
+ # Parsing continues until one of the following criteria are fulfilled:
116
+ #
117
+ # - The end of the document is reached.
118
+ # - The matching end tag for the element +el+ is found (only used if +el+ is an HTML
119
+ # element).
120
+ #
121
+ # When an HTML start tag is found, processing is deferred to #handle_html_start_tag,
122
+ # providing the block given to this method.
123
+ def parse_raw_html(el, &block)
124
+ @stack.push(@tree)
125
+ @tree = el
126
+
127
+ done = false
128
+ while !@src.eos? && !done
129
+ if result = @src.scan_until(HTML_RAW_START)
130
+ add_text(result, @tree, :text)
131
+ if result = @src.scan(HTML_COMMENT_RE)
132
+ @tree.children << Element.new(:xml_comment, result, nil, :category => :block)
133
+ elsif result = @src.scan(HTML_INSTRUCTION_RE)
134
+ @tree.children << Element.new(:xml_pi, result, nil, :category => :block)
135
+ elsif @src.scan(HTML_TAG_RE)
136
+ handle_html_start_tag(&block)
137
+ elsif @src.scan(HTML_TAG_CLOSE_RE)
138
+ if @tree.value == @src[1]
139
+ done = true
140
+ else
141
+ warning("Found invalidly used HTML closing tag for '#{@src[1]}' - ignoring it")
142
+ end
143
+ else
144
+ add_text(@src.scan(/./), @tree, :text)
145
+ end
146
+ else
147
+ result = @src.scan(/.*/m)
148
+ add_text(result, @tree, :text)
149
+ warning("Found no end tag for '#{@tree.value}' - auto-closing it") if @tree.type == :html_element
150
+ done = true
151
+ end
152
+ end
153
+
154
+ @tree = @stack.pop
155
+ end
156
+
157
+ end
158
+
159
+
160
+ # Converts HTML elements to native elements if possible.
161
+ class ElementConverter
162
+
163
+ include Constants
164
+ include ::Newstile::Utils::Entities
165
+
166
+ REMOVE_TEXT_CHILDREN = %w{html head hgroup ol ul dl table colgroup tbody thead tfoot tr select optgroup}
167
+ WRAP_TEXT_CHILDREN = %w{body section nav article aside header footer address div li dd blockquote figure
168
+ figcaption fieldset form}
169
+ REMOVE_WHITESPACE_CHILDREN = %w{body section nav article aside header footer address
170
+ div li dd blockquote figure figcaption td th fieldset form}
171
+ STRIP_WHITESPACE = %w{address article aside blockquote body caption dd div dl dt fieldset figcaption form footer
172
+ header h1 h2 h3 h4 h5 h6 legend li nav p section td th}
173
+ SIMPLE_ELEMENTS = %w{em strong blockquote hr br img p thead tbody tfoot tr td th ul ol dl li dl dt dd}
174
+
175
+ def initialize(doc)
176
+ @doc = doc
177
+ end
178
+
179
+ # Convert the element +el+ and its children.
180
+ def process(el, do_conversion = true, preserve_text = false, parent = nil)
181
+ case el.type
182
+ when :xml_comment, :xml_pi, :html_doctype
183
+ ptype = if parent.nil?
184
+ 'div'
185
+ else
186
+ case parent.type
187
+ when :html_element then parent.value
188
+ when :code_span then 'code'
189
+ when :code_block then 'pre'
190
+ when :header then 'h1'
191
+ else parent.type.to_s
192
+ end
193
+ end
194
+ el.options = {:category => HTML_PARSE_AS_SPAN.include?(ptype) ? :span : :block}
195
+ return
196
+ when :html_element
197
+ else return
198
+ end
199
+
200
+ type = el.value
201
+ remove_text_children(el) if REMOVE_TEXT_CHILDREN.include?(type)
202
+
203
+ mname = "convert_#{el.value}"
204
+ if do_conversion && self.class.method_defined?(mname)
205
+ send(mname, el)
206
+ elsif do_conversion && SIMPLE_ELEMENTS.include?(type)
207
+ set_basics(el, type.intern, HTML_SPAN_ELEMENTS.include?(type) ? :span : :block)
208
+ process_children(el, do_conversion, preserve_text)
209
+ else
210
+ process_html_element(el, do_conversion, preserve_text)
211
+ end
212
+
213
+ strip_whitespace(el) if STRIP_WHITESPACE.include?(type)
214
+ remove_whitespace_children(el) if REMOVE_WHITESPACE_CHILDREN.include?(type)
215
+ wrap_text_children(el) if WRAP_TEXT_CHILDREN.include?(type)
216
+ end
217
+
218
+ def process_children(el, do_conversion = true, preserve_text = false)
219
+ el.children.map! do |c|
220
+ if c.type == :text
221
+ process_text(c.value, preserve_text)
222
+ else
223
+ process(c, do_conversion, preserve_text, el)
224
+ c
225
+ end
226
+ end.flatten!
227
+ end
228
+
229
+ # Process the HTML text +raw+: compress whitespace (if +preserve+ is +false+) and convert
230
+ # entities in entity elements.
231
+ def process_text(raw, preserve = false)
232
+ raw.gsub!(/\s+/, ' ') unless preserve
233
+ src = StringScanner.new(raw)
234
+ result = []
235
+ while !src.eos?
236
+ if tmp = src.scan_until(/(?=#{HTML_ENTITY_RE})/)
237
+ result << Element.new(:text, tmp)
238
+ src.scan(HTML_ENTITY_RE)
239
+ val = src[1] || (src[2] && src[2].to_i) || src[3].hex
240
+ result << if %w{lsquo rsquo ldquo rdquo}.include?(val)
241
+ Element.new(:smart_quote, val.intern)
242
+ elsif %w{mdash ndash hellip laquo raquo}.include?(val)
243
+ Element.new(:typographic_sym, val.intern)
244
+ else
245
+ Element.new(:entity, entity(val), nil, :original => src.matched)
246
+ end
247
+ else
248
+ result << Element.new(:text, src.scan(/.*/m))
249
+ end
250
+ end
251
+ result
252
+ end
253
+
254
+ def process_html_element(el, do_conversion = true, preserve_text = false)
255
+ el.options = {:category => HTML_SPAN_ELEMENTS.include?(el.value) ? :span : :block,
256
+ :parse_type => HTML_PARSE_AS[el.value]
257
+ }
258
+ process_children(el, do_conversion, preserve_text)
259
+ end
260
+
261
+ def remove_text_children(el)
262
+ el.children.delete_if {|c| c.type == :text}
263
+ end
264
+
265
+ def wrap_text_children(el)
266
+ tmp = []
267
+ last_is_p = false
268
+ el.children.each do |c|
269
+ if c.options[:category] != :block || c.type == :text
270
+ if !last_is_p
271
+ tmp << Element.new(:p, nil, nil, :transparent => true)
272
+ last_is_p = true
273
+ end
274
+ tmp.last.children << c
275
+ tmp
276
+ else
277
+ tmp << c
278
+ last_is_p = false
279
+ end
280
+ end
281
+ el.children = tmp
282
+ end
283
+
284
+ def strip_whitespace(el)
285
+ return if el.children.empty?
286
+ if el.children.first.type == :text
287
+ el.children.first.value.lstrip!
288
+ end
289
+ if el.children.last.type == :text
290
+ el.children.last.value.rstrip!
291
+ end
292
+ end
293
+
294
+ def remove_whitespace_children(el)
295
+ i = -1
296
+ el.children.delete_if do |c|
297
+ i += 1
298
+ c.type == :text && c.value.strip.empty? &&
299
+ (i == 0 || i == el.children.length - 1 || (el.children[i-1].options[:category] == :block &&
300
+ el.children[i+1].options[:category] == :block))
301
+ end
302
+ end
303
+
304
+ def set_basics(el, type, category, opts = {})
305
+ el.type = type
306
+ el.options = {:category => category}.merge(opts)
307
+ el.value = nil
308
+ end
309
+
310
+ def extract_text(el, raw)
311
+ raw << el.value.to_s if el.type == :text
312
+ el.children.each {|c| extract_text(c, raw)}
313
+ end
314
+
315
+ def convert_a(el)
316
+ if el.attr['href']
317
+ set_basics(el, :a, :span)
318
+ process_children(el)
319
+ else
320
+ process_html_element(el, false)
321
+ end
322
+ end
323
+
324
+ def convert_b(el)
325
+ set_basics(el, :strong, :span)
326
+ process_children(el)
327
+ end
328
+
329
+ def convert_i(el)
330
+ set_basics(el, :em, :span)
331
+ process_children(el)
332
+ end
333
+
334
+ def convert_h1(el)
335
+ set_basics(el, :header, :block, :level => el.value[1..1].to_i)
336
+ extract_text(el, el.options[:raw_text] = '')
337
+ process_children(el)
338
+ end
339
+ %w{h2 h3 h4 h5 h6}.each do |i|
340
+ alias_method("convert_#{i}".to_sym, :convert_h1)
341
+ end
342
+
343
+ def convert_code(el)
344
+ raw = ''
345
+ extract_text(el, raw)
346
+ result = process_text(raw, true)
347
+ begin
348
+ str = result.inject('') do |mem, c|
349
+ if c.type == :text
350
+ mem << c.value
351
+ elsif c.type == :entity
352
+ if RUBY_VERSION >= '1.9'
353
+ mem << c.value.char.encode(@doc.parse_infos[:encoding])
354
+ elsif [60, 62, 34, 38].include?(c.value.code_point)
355
+ mem << c.value.code_point.chr
356
+ end
357
+ elsif c.type == :smart_quote || c.type == :typographic_sym
358
+ mem << entity(c.value.to_s).char.encode(@doc.parse_infos[:encoding])
359
+ else
360
+ raise "Bug - please report"
361
+ end
362
+ end
363
+ result.clear
364
+ result << Element.new(:text, str)
365
+ rescue
366
+ end
367
+ if result.length > 1 || result.first.type != :text
368
+ process_html_element(el, false, true)
369
+ else
370
+ if el.value == 'code'
371
+ set_basics(el, :codespan, :span)
372
+ else
373
+ set_basics(el, :codeblock, :block)
374
+ end
375
+ el.value = result.first.value
376
+ el.children.clear
377
+ end
378
+ end
379
+ alias :convert_pre :convert_code
380
+
381
+ def convert_table(el)
382
+ if !is_simple_table?(el)
383
+ process_html_element(el, false)
384
+ return
385
+ end
386
+ process_children(el)
387
+ set_basics(el, :table, :block)
388
+ el.options[:alignment] = []
389
+ calc_alignment = lambda do |c|
390
+ if c.type == :tr && el.options[:alignment].empty?
391
+ el.options[:alignment] = [:default] * c.children.length
392
+ break
393
+ else
394
+ c.children.each {|cc| calc_alignment.call(cc)}
395
+ end
396
+ end
397
+ calc_alignment.call(el)
398
+ if el.children.first.type == :tr
399
+ tbody = Element.new(:tbody, nil, nil, :category => :block)
400
+ tbody.children = el.children
401
+ el.children = [tbody]
402
+ end
403
+ end
404
+
405
+ def is_simple_table?(el)
406
+ only_phrasing_content = lambda do |c|
407
+ c.children.all? do |cc|
408
+ (cc.type == :text || !HTML_BLOCK_ELEMENTS.include?(cc.value)) && only_phrasing_content.call(cc)
409
+ end
410
+ end
411
+ check_cells = Proc.new do |c|
412
+ if c.value == 'th' || c.value == 'td'
413
+ return false if !only_phrasing_content.call(c)
414
+ else
415
+ c.children.each {|cc| check_cells.call(cc)}
416
+ end
417
+ end
418
+ check_cells.call(el)
419
+
420
+ check_rows = lambda do |t, type|
421
+ t.children.all? {|r| (r.value == 'tr' || r.type == :text) && r.children.all? {|c| c.value == type || c.type == :text}}
422
+ end
423
+ check_rows.call(el, 'td') ||
424
+ (el.children.all? do |t|
425
+ t.type == :text || (t.value == 'thead' && check_rows.call(t, 'th')) ||
426
+ ((t.value == 'tfoot' || t.value == 'tbody') && check_rows.call(t, 'td'))
427
+ end && el.children.any? {|t| t.value == 'tbody'})
428
+ end
429
+
430
+ def convert_div(el)
431
+ if !is_math_tag?(el)
432
+ process_html_element(el)
433
+ else
434
+ handle_math_tag(el)
435
+ end
436
+ end
437
+ alias :convert_span :convert_div
438
+
439
+ def is_math_tag?(el)
440
+ el.attr['class'].to_s =~ /\bmath\b/ &&
441
+ el.children.size == 1 && el.children.first.type == :text
442
+ end
443
+
444
+ def handle_math_tag(el)
445
+ set_basics(el, :math, (el.value == 'div' ? :block : :span))
446
+ el.value = el.children.shift.value
447
+ if el.attr['class'] =~ /^\s*math\s*$/
448
+ el.attr.delete('class')
449
+ else
450
+ el.attr['class'].sub!(/\s?math/, '')
451
+ end
452
+ el.value.gsub!(/&(amp|quot|gt|lt);/) do |m|
453
+ case m
454
+ when '&amp;' then '&'
455
+ when '&quot;' then '"'
456
+ when '&gt;' then '>'
457
+ when '&lt;' then '<'
458
+ end
459
+ end
460
+ end
461
+ end
462
+
463
+ include Parser
464
+
465
# Parse +source+ as HTML document and return the created +tree+.
#
# Processing instructions, doctypes and comments at the start of the document are handled
# first, then the rest is parsed as raw HTML. Afterwards the HTML elements are converted to
# native elements where possible.
def parse(source)
  @stack = []
  @tree = Element.new(:root)
  @src = StringScanner.new(adapt_source(source))

  # The regexps are built once here instead of on every iteration of the loop.
  instruction_re = /\s*#{HTML_INSTRUCTION_RE}/
  doctype_re = /\s*#{HTML_DOCTYPE_RE}/
  comment_re = /\s*#{HTML_COMMENT_RE}/

  loop do
    type = if result = @src.scan(instruction_re)
             :xml_pi
           elsif result = @src.scan(doctype_re)
             :html_doctype
           elsif result = @src.scan(comment_re)
             :xml_comment
           end
    break if type.nil?
    @tree.children << Element.new(type, result.strip, nil, :category => :block)
  end

  # Elements that are not closed yet get their content parsed recursively.
  tag_handler = lambda do |c, closed|
    parse_raw_html(c, &tag_handler) if !closed
  end
  parse_raw_html(@tree, &tag_handler)

  ec = ElementConverter.new(@doc)
  @tree.children.each {|c| ec.process(c)}
  ec.remove_whitespace_children(@tree)
  @tree
end
493
+
494
+ end
495
+
496
+ end
497
+
498
+ end
499
+