newstile 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (405) hide show
  1. data/AUTHORS +1 -0
  2. data/CONTRIBUTERS +6 -0
  3. data/COPYING +24 -0
  4. data/ChangeLog +5489 -0
  5. data/GPL +674 -0
  6. data/README +31 -0
  7. data/Rakefile +342 -0
  8. data/VERSION +1 -0
  9. data/benchmark/benchmark.rb +34 -0
  10. data/benchmark/generate_data.rb +112 -0
  11. data/benchmark/historic-jruby-1.4.0.dat +7 -0
  12. data/benchmark/historic-ruby-1.8.6.dat +7 -0
  13. data/benchmark/historic-ruby-1.8.7.dat +7 -0
  14. data/benchmark/historic-ruby-1.9.1p243.dat +7 -0
  15. data/benchmark/historic-ruby-1.9.2dev.dat +7 -0
  16. data/benchmark/mdbasics.text +306 -0
  17. data/benchmark/mdsyntax.text +888 -0
  18. data/benchmark/static-jruby-1.4.0.dat +7 -0
  19. data/benchmark/static-ruby-1.8.6.dat +7 -0
  20. data/benchmark/static-ruby-1.8.7.dat +7 -0
  21. data/benchmark/static-ruby-1.9.1p243.dat +7 -0
  22. data/benchmark/static-ruby-1.9.2dev.dat +7 -0
  23. data/benchmark/testing.sh +9 -0
  24. data/benchmark/timing.sh +10 -0
  25. data/bin/newstile +82 -0
  26. data/data/newstile/document.html +18 -0
  27. data/data/newstile/document.latex +43 -0
  28. data/doc/default.scss.css +519 -0
  29. data/doc/default.template +80 -0
  30. data/doc/documentation.page +72 -0
  31. data/doc/index.page +96 -0
  32. data/doc/installation.page +90 -0
  33. data/doc/links.markdown +6 -0
  34. data/doc/news.feed +10 -0
  35. data/doc/news.page +28 -0
  36. data/doc/quickref.page +564 -0
  37. data/doc/syntax.page +1615 -0
  38. data/doc/tests.page +51 -0
  39. data/doc/virtual +2 -0
  40. data/lib/newstile.rb +23 -0
  41. data/lib/newstile/compatibility.rb +34 -0
  42. data/lib/newstile/converter.rb +43 -0
  43. data/lib/newstile/converter/base.rb +111 -0
  44. data/lib/newstile/converter/html.rb +405 -0
  45. data/lib/newstile/converter/latex.rb +577 -0
  46. data/lib/newstile/converter/markdown.rb +426 -0
  47. data/lib/newstile/converter/newstile.rb +426 -0
  48. data/lib/newstile/document.rb +168 -0
  49. data/lib/newstile/error.rb +27 -0
  50. data/lib/newstile/options.rb +296 -0
  51. data/lib/newstile/parser.rb +39 -0
  52. data/lib/newstile/parser/base.rb +94 -0
  53. data/lib/newstile/parser/html.rb +499 -0
  54. data/lib/newstile/parser/newstile.rb +325 -0
  55. data/lib/newstile/parser/newstile/abbreviation.rb +66 -0
  56. data/lib/newstile/parser/newstile/attribute_list.rb +111 -0
  57. data/lib/newstile/parser/newstile/autolink.rb +54 -0
  58. data/lib/newstile/parser/newstile/blank_line.rb +43 -0
  59. data/lib/newstile/parser/newstile/block_boundary.rb +46 -0
  60. data/lib/newstile/parser/newstile/blockquote.rb +63 -0
  61. data/lib/newstile/parser/newstile/codeblock.rb +60 -0
  62. data/lib/newstile/parser/newstile/codespan.rb +57 -0
  63. data/lib/newstile/parser/newstile/emphasis.rb +70 -0
  64. data/lib/newstile/parser/newstile/eob.rb +39 -0
  65. data/lib/newstile/parser/newstile/escaped_chars.rb +38 -0
  66. data/lib/newstile/parser/newstile/extension.rb +116 -0
  67. data/lib/newstile/parser/newstile/footnote.rb +74 -0
  68. data/lib/newstile/parser/newstile/header.rb +84 -0
  69. data/lib/newstile/parser/newstile/horizontal_rule.rb +39 -0
  70. data/lib/newstile/parser/newstile/html.rb +175 -0
  71. data/lib/newstile/parser/newstile/html_entity.rb +39 -0
  72. data/lib/newstile/parser/newstile/line_break.rb +38 -0
  73. data/lib/newstile/parser/newstile/link.rb +177 -0
  74. data/lib/newstile/parser/newstile/list.rb +239 -0
  75. data/lib/newstile/parser/newstile/math.rb +64 -0
  76. data/lib/newstile/parser/newstile/paragraph.rb +55 -0
  77. data/lib/newstile/parser/newstile/smart_quotes.rb +214 -0
  78. data/lib/newstile/parser/newstile/table.rb +134 -0
  79. data/lib/newstile/parser/newstile/typographic_symbol.rb +54 -0
  80. data/lib/newstile/utils.rb +37 -0
  81. data/lib/newstile/utils/entities.rb +336 -0
  82. data/lib/newstile/utils/html.rb +75 -0
  83. data/lib/newstile/utils/ordered_hash.rb +79 -0
  84. data/lib/newstile/version.rb +28 -0
  85. data/man/man1/newstile.1 +246 -0
  86. data/setup.rb +1585 -0
  87. data/test/run_tests.rb +59 -0
  88. data/test/test_files.rb +162 -0
  89. data/test/testcases/block/01_blank_line/spaces.html +1 -0
  90. data/test/testcases/block/01_blank_line/spaces.text +3 -0
  91. data/test/testcases/block/01_blank_line/tabs.html +1 -0
  92. data/test/testcases/block/01_blank_line/tabs.text +6 -0
  93. data/test/testcases/block/02_eob/beginning.html +1 -0
  94. data/test/testcases/block/02_eob/beginning.text +3 -0
  95. data/test/testcases/block/02_eob/end.html +1 -0
  96. data/test/testcases/block/02_eob/end.text +3 -0
  97. data/test/testcases/block/02_eob/middle.html +1 -0
  98. data/test/testcases/block/02_eob/middle.text +5 -0
  99. data/test/testcases/block/03_paragraph/indented.html +18 -0
  100. data/test/testcases/block/03_paragraph/indented.text +19 -0
  101. data/test/testcases/block/03_paragraph/no_newline_at_end.html +5 -0
  102. data/test/testcases/block/03_paragraph/no_newline_at_end.text +5 -0
  103. data/test/testcases/block/03_paragraph/one_para.html +1 -0
  104. data/test/testcases/block/03_paragraph/one_para.text +1 -0
  105. data/test/testcases/block/03_paragraph/two_para.html +4 -0
  106. data/test/testcases/block/03_paragraph/two_para.text +4 -0
  107. data/test/testcases/block/04_header/atx_header.html +37 -0
  108. data/test/testcases/block/04_header/atx_header.text +34 -0
  109. data/test/testcases/block/04_header/atx_header_no_newline_at_end.html +1 -0
  110. data/test/testcases/block/04_header/atx_header_no_newline_at_end.text +1 -0
  111. data/test/testcases/block/04_header/setext_header.html +30 -0
  112. data/test/testcases/block/04_header/setext_header.html.19 +30 -0
  113. data/test/testcases/block/04_header/setext_header.text +36 -0
  114. data/test/testcases/block/04_header/setext_header_no_newline_at_end.html +1 -0
  115. data/test/testcases/block/04_header/setext_header_no_newline_at_end.text +2 -0
  116. data/test/testcases/block/04_header/with_auto_id_prefix.html +3 -0
  117. data/test/testcases/block/04_header/with_auto_id_prefix.options +2 -0
  118. data/test/testcases/block/04_header/with_auto_id_prefix.text +3 -0
  119. data/test/testcases/block/04_header/with_auto_ids.html +17 -0
  120. data/test/testcases/block/04_header/with_auto_ids.options +1 -0
  121. data/test/testcases/block/04_header/with_auto_ids.text +19 -0
  122. data/test/testcases/block/05_blockquote/indented.html +25 -0
  123. data/test/testcases/block/05_blockquote/indented.text +14 -0
  124. data/test/testcases/block/05_blockquote/lazy.html +34 -0
  125. data/test/testcases/block/05_blockquote/lazy.text +20 -0
  126. data/test/testcases/block/05_blockquote/nested.html +10 -0
  127. data/test/testcases/block/05_blockquote/nested.text +6 -0
  128. data/test/testcases/block/05_blockquote/no_newline_at_end.html +4 -0
  129. data/test/testcases/block/05_blockquote/no_newline_at_end.text +2 -0
  130. data/test/testcases/block/05_blockquote/with_code_blocks.html +15 -0
  131. data/test/testcases/block/05_blockquote/with_code_blocks.text +11 -0
  132. data/test/testcases/block/06_codeblock/error.html +4 -0
  133. data/test/testcases/block/06_codeblock/error.text +4 -0
  134. data/test/testcases/block/06_codeblock/lazy.html +4 -0
  135. data/test/testcases/block/06_codeblock/lazy.text +5 -0
  136. data/test/testcases/block/06_codeblock/no_newline_at_end.html +2 -0
  137. data/test/testcases/block/06_codeblock/no_newline_at_end.text +1 -0
  138. data/test/testcases/block/06_codeblock/no_newline_at_end_1.html +2 -0
  139. data/test/testcases/block/06_codeblock/no_newline_at_end_1.text +2 -0
  140. data/test/testcases/block/06_codeblock/normal.html +13 -0
  141. data/test/testcases/block/06_codeblock/normal.text +10 -0
  142. data/test/testcases/block/06_codeblock/tilde_syntax.html +7 -0
  143. data/test/testcases/block/06_codeblock/tilde_syntax.text +9 -0
  144. data/test/testcases/block/06_codeblock/whitespace.html +3 -0
  145. data/test/testcases/block/06_codeblock/whitespace.text +3 -0
  146. data/test/testcases/block/06_codeblock/with_blank_line.html +13 -0
  147. data/test/testcases/block/06_codeblock/with_blank_line.text +12 -0
  148. data/test/testcases/block/06_codeblock/with_eob_marker.html +6 -0
  149. data/test/testcases/block/06_codeblock/with_eob_marker.text +5 -0
  150. data/test/testcases/block/06_codeblock/with_ial.html +6 -0
  151. data/test/testcases/block/06_codeblock/with_ial.text +5 -0
  152. data/test/testcases/block/07_horizontal_rule/error.html +7 -0
  153. data/test/testcases/block/07_horizontal_rule/error.html.19 +7 -0
  154. data/test/testcases/block/07_horizontal_rule/error.text +7 -0
  155. data/test/testcases/block/07_horizontal_rule/normal.html +17 -0
  156. data/test/testcases/block/07_horizontal_rule/normal.text +17 -0
  157. data/test/testcases/block/07_horizontal_rule/sepspaces.html +3 -0
  158. data/test/testcases/block/07_horizontal_rule/sepspaces.text +3 -0
  159. data/test/testcases/block/07_horizontal_rule/septabs.html +3 -0
  160. data/test/testcases/block/07_horizontal_rule/septabs.text +3 -0
  161. data/test/testcases/block/08_list/escaping.html +17 -0
  162. data/test/testcases/block/08_list/escaping.text +17 -0
  163. data/test/testcases/block/08_list/item_ial.html +7 -0
  164. data/test/testcases/block/08_list/item_ial.text +5 -0
  165. data/test/testcases/block/08_list/lazy.html +39 -0
  166. data/test/testcases/block/08_list/lazy.text +29 -0
  167. data/test/testcases/block/08_list/list_and_hr.html +9 -0
  168. data/test/testcases/block/08_list/list_and_hr.text +5 -0
  169. data/test/testcases/block/08_list/list_and_others.html +40 -0
  170. data/test/testcases/block/08_list/list_and_others.text +26 -0
  171. data/test/testcases/block/08_list/mixed.html +117 -0
  172. data/test/testcases/block/08_list/mixed.text +66 -0
  173. data/test/testcases/block/08_list/nested.html +17 -0
  174. data/test/testcases/block/08_list/nested.text +7 -0
  175. data/test/testcases/block/08_list/other_first_element.html +39 -0
  176. data/test/testcases/block/08_list/other_first_element.text +18 -0
  177. data/test/testcases/block/08_list/simple_ol.html +19 -0
  178. data/test/testcases/block/08_list/simple_ol.text +13 -0
  179. data/test/testcases/block/08_list/simple_ul.html +48 -0
  180. data/test/testcases/block/08_list/simple_ul.text +36 -0
  181. data/test/testcases/block/08_list/single_item.html +3 -0
  182. data/test/testcases/block/08_list/single_item.text +1 -0
  183. data/test/testcases/block/08_list/special_cases.html +55 -0
  184. data/test/testcases/block/08_list/special_cases.text +35 -0
  185. data/test/testcases/block/09_html/comment.html +18 -0
  186. data/test/testcases/block/09_html/comment.text +15 -0
  187. data/test/testcases/block/09_html/content_model/deflists.html +6 -0
  188. data/test/testcases/block/09_html/content_model/deflists.options +1 -0
  189. data/test/testcases/block/09_html/content_model/deflists.text +6 -0
  190. data/test/testcases/block/09_html/content_model/tables.html +14 -0
  191. data/test/testcases/block/09_html/content_model/tables.options +1 -0
  192. data/test/testcases/block/09_html/content_model/tables.text +14 -0
  193. data/test/testcases/block/09_html/html_and_codeblocks.html +15 -0
  194. data/test/testcases/block/09_html/html_and_codeblocks.options +1 -0
  195. data/test/testcases/block/09_html/html_and_codeblocks.text +13 -0
  196. data/test/testcases/block/09_html/html_to_native/code.html +10 -0
  197. data/test/testcases/block/09_html/html_to_native/code.text +9 -0
  198. data/test/testcases/block/09_html/html_to_native/comment.html +7 -0
  199. data/test/testcases/block/09_html/html_to_native/comment.text +8 -0
  200. data/test/testcases/block/09_html/html_to_native/emphasis.html +3 -0
  201. data/test/testcases/block/09_html/html_to_native/emphasis.text +3 -0
  202. data/test/testcases/block/09_html/html_to_native/entity.html +1 -0
  203. data/test/testcases/block/09_html/html_to_native/entity.text +1 -0
  204. data/test/testcases/block/09_html/html_to_native/header.html +6 -0
  205. data/test/testcases/block/09_html/html_to_native/header.options +2 -0
  206. data/test/testcases/block/09_html/html_to_native/header.text +6 -0
  207. data/test/testcases/block/09_html/html_to_native/list_dl.html +8 -0
  208. data/test/testcases/block/09_html/html_to_native/list_dl.text +8 -0
  209. data/test/testcases/block/09_html/html_to_native/list_ol.html +15 -0
  210. data/test/testcases/block/09_html/html_to_native/list_ol.text +17 -0
  211. data/test/testcases/block/09_html/html_to_native/list_ul.html +19 -0
  212. data/test/testcases/block/09_html/html_to_native/list_ul.text +22 -0
  213. data/test/testcases/block/09_html/html_to_native/options +1 -0
  214. data/test/testcases/block/09_html/html_to_native/paragraph.html +3 -0
  215. data/test/testcases/block/09_html/html_to_native/paragraph.text +4 -0
  216. data/test/testcases/block/09_html/html_to_native/table_normal.html +14 -0
  217. data/test/testcases/block/09_html/html_to_native/table_normal.text +12 -0
  218. data/test/testcases/block/09_html/html_to_native/table_simple.html +48 -0
  219. data/test/testcases/block/09_html/html_to_native/table_simple.text +56 -0
  220. data/test/testcases/block/09_html/html_to_native/typography.html +1 -0
  221. data/test/testcases/block/09_html/html_to_native/typography.html.19 +1 -0
  222. data/test/testcases/block/09_html/html_to_native/typography.text +1 -0
  223. data/test/testcases/block/09_html/invalid_html_1.html +5 -0
  224. data/test/testcases/block/09_html/invalid_html_1.text +5 -0
  225. data/test/testcases/block/09_html/invalid_html_2.html +5 -0
  226. data/test/testcases/block/09_html/invalid_html_2.text +5 -0
  227. data/test/testcases/block/09_html/markdown_attr.html +38 -0
  228. data/test/testcases/block/09_html/markdown_attr.text +38 -0
  229. data/test/testcases/block/09_html/not_parsed.html +24 -0
  230. data/test/testcases/block/09_html/not_parsed.text +24 -0
  231. data/test/testcases/block/09_html/parse_as_raw.html +30 -0
  232. data/test/testcases/block/09_html/parse_as_raw.options +1 -0
  233. data/test/testcases/block/09_html/parse_as_raw.text +29 -0
  234. data/test/testcases/block/09_html/parse_as_span.html +12 -0
  235. data/test/testcases/block/09_html/parse_as_span.options +1 -0
  236. data/test/testcases/block/09_html/parse_as_span.text +9 -0
  237. data/test/testcases/block/09_html/parse_block_html.html +21 -0
  238. data/test/testcases/block/09_html/parse_block_html.options +1 -0
  239. data/test/testcases/block/09_html/parse_block_html.text +17 -0
  240. data/test/testcases/block/09_html/processing_instruction.html +13 -0
  241. data/test/testcases/block/09_html/processing_instruction.text +12 -0
  242. data/test/testcases/block/09_html/simple.html +64 -0
  243. data/test/testcases/block/09_html/simple.html.19 +64 -0
  244. data/test/testcases/block/09_html/simple.options +1 -0
  245. data/test/testcases/block/09_html/simple.text +59 -0
  246. data/test/testcases/block/10_ald/simple.html +2 -0
  247. data/test/testcases/block/10_ald/simple.text +8 -0
  248. data/test/testcases/block/11_ial/auto_id_and_ial.html +1 -0
  249. data/test/testcases/block/11_ial/auto_id_and_ial.options +1 -0
  250. data/test/testcases/block/11_ial/auto_id_and_ial.text +2 -0
  251. data/test/testcases/block/11_ial/simple.html +25 -0
  252. data/test/testcases/block/11_ial/simple.text +34 -0
  253. data/test/testcases/block/12_extension/comment.html +8 -0
  254. data/test/testcases/block/12_extension/comment.text +12 -0
  255. data/test/testcases/block/12_extension/ignored.html +8 -0
  256. data/test/testcases/block/12_extension/ignored.text +8 -0
  257. data/test/testcases/block/12_extension/nomarkdown.html +10 -0
  258. data/test/testcases/block/12_extension/nomarkdown.kramdown +20 -0
  259. data/test/testcases/block/12_extension/nomarkdown.latex +13 -0
  260. data/test/testcases/block/12_extension/nomarkdown.text +21 -0
  261. data/test/testcases/block/12_extension/options.html +21 -0
  262. data/test/testcases/block/12_extension/options.text +21 -0
  263. data/test/testcases/block/12_extension/options2.html +10 -0
  264. data/test/testcases/block/12_extension/options2.text +5 -0
  265. data/test/testcases/block/12_extension/options3.html +7 -0
  266. data/test/testcases/block/12_extension/options3.text +7 -0
  267. data/test/testcases/block/13_definition_list/definition_at_beginning.html +1 -0
  268. data/test/testcases/block/13_definition_list/definition_at_beginning.text +1 -0
  269. data/test/testcases/block/13_definition_list/item_ial.html +12 -0
  270. data/test/testcases/block/13_definition_list/item_ial.text +8 -0
  271. data/test/testcases/block/13_definition_list/multiple_terms.html +13 -0
  272. data/test/testcases/block/13_definition_list/multiple_terms.text +10 -0
  273. data/test/testcases/block/13_definition_list/no_def_list.html +2 -0
  274. data/test/testcases/block/13_definition_list/no_def_list.text +2 -0
  275. data/test/testcases/block/13_definition_list/para_wrapping.html +10 -0
  276. data/test/testcases/block/13_definition_list/para_wrapping.text +6 -0
  277. data/test/testcases/block/13_definition_list/separated_by_eob.html +8 -0
  278. data/test/testcases/block/13_definition_list/separated_by_eob.text +5 -0
  279. data/test/testcases/block/13_definition_list/simple.html +8 -0
  280. data/test/testcases/block/13_definition_list/simple.text +7 -0
  281. data/test/testcases/block/13_definition_list/styled_terms.html +4 -0
  282. data/test/testcases/block/13_definition_list/styled_terms.text +2 -0
  283. data/test/testcases/block/13_definition_list/too_much_space.html +3 -0
  284. data/test/testcases/block/13_definition_list/too_much_space.text +4 -0
  285. data/test/testcases/block/13_definition_list/with_blocks.html +38 -0
  286. data/test/testcases/block/13_definition_list/with_blocks.text +24 -0
  287. data/test/testcases/block/14_table/errors.html +8 -0
  288. data/test/testcases/block/14_table/errors.text +9 -0
  289. data/test/testcases/block/14_table/footer.html +65 -0
  290. data/test/testcases/block/14_table/footer.text +25 -0
  291. data/test/testcases/block/14_table/header.html +103 -0
  292. data/test/testcases/block/14_table/header.text +32 -0
  293. data/test/testcases/block/14_table/no_table.html +3 -0
  294. data/test/testcases/block/14_table/no_table.text +3 -0
  295. data/test/testcases/block/14_table/simple.html +139 -0
  296. data/test/testcases/block/14_table/simple.text +38 -0
  297. data/test/testcases/block/15_math/normal.html +26 -0
  298. data/test/testcases/block/15_math/normal.text +28 -0
  299. data/test/testcases/block/16_toc/no_toc_depth.html +33 -0
  300. data/test/testcases/block/16_toc/no_toc_depth.options +1 -0
  301. data/test/testcases/block/16_toc/no_toc_depth.text +16 -0
  302. data/test/testcases/block/16_toc/toc_depth_2.html +24 -0
  303. data/test/testcases/block/16_toc/toc_depth_2.options +1 -0
  304. data/test/testcases/block/16_toc/toc_depth_2.text +16 -0
  305. data/test/testcases/encoding.html +46 -0
  306. data/test/testcases/encoding.text +28 -0
  307. data/test/testcases/span/01_link/empty.html +5 -0
  308. data/test/testcases/span/01_link/empty.text +5 -0
  309. data/test/testcases/span/01_link/image_in_a.html +5 -0
  310. data/test/testcases/span/01_link/image_in_a.text +5 -0
  311. data/test/testcases/span/01_link/imagelinks.html +14 -0
  312. data/test/testcases/span/01_link/imagelinks.text +16 -0
  313. data/test/testcases/span/01_link/inline.html +46 -0
  314. data/test/testcases/span/01_link/inline.html.19 +46 -0
  315. data/test/testcases/span/01_link/inline.text +48 -0
  316. data/test/testcases/span/01_link/link_defs.html +9 -0
  317. data/test/testcases/span/01_link/link_defs.text +26 -0
  318. data/test/testcases/span/01_link/links_with_angle_brackets.html +3 -0
  319. data/test/testcases/span/01_link/links_with_angle_brackets.text +3 -0
  320. data/test/testcases/span/01_link/reference.html +35 -0
  321. data/test/testcases/span/01_link/reference.html.19 +35 -0
  322. data/test/testcases/span/01_link/reference.text +47 -0
  323. data/test/testcases/span/02_emphasis/empty.html +3 -0
  324. data/test/testcases/span/02_emphasis/empty.text +3 -0
  325. data/test/testcases/span/02_emphasis/errors.html +9 -0
  326. data/test/testcases/span/02_emphasis/errors.text +9 -0
  327. data/test/testcases/span/02_emphasis/nesting.html +38 -0
  328. data/test/testcases/span/02_emphasis/nesting.text +33 -0
  329. data/test/testcases/span/02_emphasis/normal.html +46 -0
  330. data/test/testcases/span/02_emphasis/normal.text +46 -0
  331. data/test/testcases/span/03_codespan/empty.html +5 -0
  332. data/test/testcases/span/03_codespan/empty.text +5 -0
  333. data/test/testcases/span/03_codespan/errors.html +1 -0
  334. data/test/testcases/span/03_codespan/errors.text +1 -0
  335. data/test/testcases/span/03_codespan/highlighting.html +1 -0
  336. data/test/testcases/span/03_codespan/highlighting.text +1 -0
  337. data/test/testcases/span/03_codespan/normal.html +16 -0
  338. data/test/testcases/span/03_codespan/normal.text +16 -0
  339. data/test/testcases/span/04_footnote/definitions.html +17 -0
  340. data/test/testcases/span/04_footnote/definitions.latex +17 -0
  341. data/test/testcases/span/04_footnote/definitions.text +24 -0
  342. data/test/testcases/span/04_footnote/footnote_nr.html +12 -0
  343. data/test/testcases/span/04_footnote/footnote_nr.latex +2 -0
  344. data/test/testcases/span/04_footnote/footnote_nr.options +1 -0
  345. data/test/testcases/span/04_footnote/footnote_nr.text +4 -0
  346. data/test/testcases/span/04_footnote/markers.html +46 -0
  347. data/test/testcases/span/04_footnote/markers.latex +23 -0
  348. data/test/testcases/span/04_footnote/markers.text +26 -0
  349. data/test/testcases/span/05_html/across_lines.html +1 -0
  350. data/test/testcases/span/05_html/across_lines.text +2 -0
  351. data/test/testcases/span/05_html/invalid.html +1 -0
  352. data/test/testcases/span/05_html/invalid.text +1 -0
  353. data/test/testcases/span/05_html/link_with_mailto.html +1 -0
  354. data/test/testcases/span/05_html/link_with_mailto.text +1 -0
  355. data/test/testcases/span/05_html/markdown_attr.html +6 -0
  356. data/test/testcases/span/05_html/markdown_attr.text +6 -0
  357. data/test/testcases/span/05_html/normal.html +30 -0
  358. data/test/testcases/span/05_html/normal.text +30 -0
  359. data/test/testcases/span/abbreviations/abbrev.html +8 -0
  360. data/test/testcases/span/abbreviations/abbrev.text +15 -0
  361. data/test/testcases/span/abbreviations/abbrev_defs.html +2 -0
  362. data/test/testcases/span/abbreviations/abbrev_defs.text +5 -0
  363. data/test/testcases/span/autolinks/url_links.html +12 -0
  364. data/test/testcases/span/autolinks/url_links.text +12 -0
  365. data/test/testcases/span/escaped_chars/normal.html +43 -0
  366. data/test/testcases/span/escaped_chars/normal.text +43 -0
  367. data/test/testcases/span/extension/comment.html +6 -0
  368. data/test/testcases/span/extension/comment.text +6 -0
  369. data/test/testcases/span/extension/ignored.html +1 -0
  370. data/test/testcases/span/extension/ignored.text +1 -0
  371. data/test/testcases/span/extension/nomarkdown.html +1 -0
  372. data/test/testcases/span/extension/nomarkdown.text +1 -0
  373. data/test/testcases/span/extension/options.html +1 -0
  374. data/test/testcases/span/extension/options.text +1 -0
  375. data/test/testcases/span/ial/simple.html +6 -0
  376. data/test/testcases/span/ial/simple.text +6 -0
  377. data/test/testcases/span/line_breaks/normal.html +11 -0
  378. data/test/testcases/span/line_breaks/normal.latex +12 -0
  379. data/test/testcases/span/line_breaks/normal.text +11 -0
  380. data/test/testcases/span/math/normal.html +5 -0
  381. data/test/testcases/span/math/normal.text +5 -0
  382. data/test/testcases/span/text_substitutions/entities.html +4 -0
  383. data/test/testcases/span/text_substitutions/entities.options +1 -0
  384. data/test/testcases/span/text_substitutions/entities.text +4 -0
  385. data/test/testcases/span/text_substitutions/entities_as_char.html +1 -0
  386. data/test/testcases/span/text_substitutions/entities_as_char.html.19 +1 -0
  387. data/test/testcases/span/text_substitutions/entities_as_char.options +1 -0
  388. data/test/testcases/span/text_substitutions/entities_as_char.text +1 -0
  389. data/test/testcases/span/text_substitutions/entities_as_input.html +1 -0
  390. data/test/testcases/span/text_substitutions/entities_as_input.options +1 -0
  391. data/test/testcases/span/text_substitutions/entities_as_input.text +1 -0
  392. data/test/testcases/span/text_substitutions/entities_numeric.html +1 -0
  393. data/test/testcases/span/text_substitutions/entities_numeric.options +1 -0
  394. data/test/testcases/span/text_substitutions/entities_numeric.text +1 -0
  395. data/test/testcases/span/text_substitutions/entities_symbolic.html +1 -0
  396. data/test/testcases/span/text_substitutions/entities_symbolic.options +1 -0
  397. data/test/testcases/span/text_substitutions/entities_symbolic.text +1 -0
  398. data/test/testcases/span/text_substitutions/greaterthan.html +1 -0
  399. data/test/testcases/span/text_substitutions/greaterthan.text +1 -0
  400. data/test/testcases/span/text_substitutions/lowerthan.html +1 -0
  401. data/test/testcases/span/text_substitutions/lowerthan.text +1 -0
  402. data/test/testcases/span/text_substitutions/typography.html +18 -0
  403. data/test/testcases/span/text_substitutions/typography.html.19 +18 -0
  404. data/test/testcases/span/text_substitutions/typography.text +18 -0
  405. metadata +476 -0
@@ -0,0 +1,39 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ #--
4
+ # Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
5
+ #
6
+ # This file is part of newstile.
7
+ #
8
+ # newstile is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+ #
22
+
23
+ module Newstile
24
+
25
+ # == Parser Module
26
+ #
27
+ # This module contains all available parsers. Currently, there are two parsers:
28
+ #
29
+ # * Newstile for parsing documents in newstile format
30
+ # * Html for parsing HTML documents
31
+ module Parser
32
+
33
+ autoload :Base, 'newstile/parser/base'
34
+ autoload :Newstile, 'newstile/parser/newstile'
35
+ autoload :Html, 'newstile/parser/html'
36
+
37
+ end
38
+
39
+ end
@@ -0,0 +1,94 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ #--
4
+ # Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
5
+ #
6
+ # This file is part of newstile.
7
+ #
8
+ # newstile is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+ #
22
+
23
+ module Newstile
24
+
25
+ module Parser
26
+
27
+ # == Base class for parsers
28
+ #
29
+ # This class serves as base class for parsers. It provides common methods that can/should be
30
+ # used by all parsers, especially by those using StringScanner for parsing.
31
+ #
32
+ class Base
33
+
34
+ # Initialize the parser with the given Newstile document +doc+.
35
+ def initialize(doc)
36
+ @doc = doc
37
+ @text_type = :text
38
+ end
39
+ private_class_method(:new, :allocate)
40
+
41
+ # Parse the +source+ string into an element tree, using the information provided by the
42
+ # Newstile document +doc+.
43
+ #
44
+ # Initializes a new instance of the calling class and then calls the #parse method that must
45
+ # be implemented by each subclass.
46
+ def self.parse(source, doc)
47
+ new(doc).parse(source)
48
+ end
49
+
50
+
51
+ # Add the given warning +text+ to the warning array of the Newstile document.
52
+ def warning(text)
53
+ @doc.warnings << text
54
+ #TODO: add position information
55
+ end
56
+
57
+ # Modify the string +source+ to be usable by the parser.
58
+ def adapt_source(source)
59
+ source.gsub(/\r\n?/, "\n").chomp + "\n"
60
+ end
61
+
62
+ # This helper method adds the given +text+ either to the last element in the +tree+ if it is a
63
+ # +type+ element or creates a new text element with the given +type+.
64
+ def add_text(text, tree = @tree, type = @text_type)
65
+ if tree.children.last && tree.children.last.type == type
66
+ tree.children.last.value << text
67
+ elsif !text.empty?
68
+ tree.children << Element.new(type, text)
69
+ end
70
+ end
71
+
72
+ # Extract the part of the StringScanner +strscan+ backed string specified by the +range+. This
73
+ # method also works correctly under Ruby 1.9.
74
+ def extract_string(range, strscan)
75
+ result = nil
76
+ if RUBY_VERSION >= '1.9'
77
+ begin
78
+ enc = strscan.string.encoding
79
+ strscan.string.force_encoding('ASCII-8BIT')
80
+ result = strscan.string[range].force_encoding(enc)
81
+ ensure
82
+ strscan.string.force_encoding(enc)
83
+ end
84
+ else
85
+ result = strscan.string[range]
86
+ end
87
+ result
88
+ end
89
+
90
+ end
91
+
92
+ end
93
+
94
+ end
@@ -0,0 +1,499 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ #--
4
+ # Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
5
+ #
6
+ # This file is part of newstile.
7
+ #
8
+ # newstile is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+ #
22
+
23
+ require 'rexml/parsers/baseparser'
24
+ require 'strscan'
25
+
26
+ module Newstile
27
+
28
+ module Parser
29
+
30
+ # Used for parsing a HTML document.
31
+ class Html < Base
32
+
33
+ # Contains all constants that are used when parsing.
34
module Constants
  #:stopdoc:
  # The following regexps are based on the ones used by REXML, with some slight modifications.
  HTML_DOCTYPE_RE = /<!DOCTYPE.*?>/m
  HTML_COMMENT_RE = /<!--(.*?)-->/m
  HTML_INSTRUCTION_RE = /<\?(.*?)\?>/m
  # Captures: 1 = attribute name, 2 = quote character, 3 = attribute value.
  HTML_ATTRIBUTE_RE = /\s*(#{REXML::Parsers::BaseParser::UNAME_STR})\s*=\s*(["'])(.*?)\2/m
  # Captures: 1 = tag name, 2 = raw attribute string, 4 = the slash of a self-closing tag.
  HTML_TAG_RE = /<((?>#{REXML::Parsers::BaseParser::UNAME_STR}))\s*((?>\s+#{REXML::Parsers::BaseParser::UNAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/m
  HTML_TAG_CLOSE_RE = /<\/(#{REXML::Parsers::BaseParser::UNAME_STR})\s*>/m
  # Captures: 1 = named entity, 2 = decimal code point, 3 = hexadecimal code point.
  HTML_ENTITY_RE = /&([\w:][\-\w\.:]*);|&#(\d+);|&\#x([0-9a-fA-F]+);/


  HTML_PARSE_AS_BLOCK = %w{applet button blockquote body colgroup dd div dl fieldset form iframe li
                           map noscript object ol table tbody thead tfoot tr td ul}
  HTML_PARSE_AS_SPAN  = %w{a abbr acronym address b bdo big cite caption del dfn dt em
                           h1 h2 h3 h4 h5 h6 i ins kbd label legend optgroup p q rb rbc
                           rp rt rtc ruby samp select small span strong sub sup th tt var}
  HTML_PARSE_AS_RAW   = %w{script math option textarea pre code}

  # Maps a tag name to the way its content is parsed (:block, :span or :raw). Unknown tag names
  # map to :raw through the hash default; a plain default value is used (instead of a default
  # block that stores the key) so that looking up arbitrary tag names never grows this hash.
  HTML_PARSE_AS = Hash.new(:raw)
  HTML_PARSE_AS_BLOCK.each {|i| HTML_PARSE_AS[i] = :block}
  HTML_PARSE_AS_SPAN.each {|i| HTML_PARSE_AS[i] = :span}
  HTML_PARSE_AS_RAW.each {|i| HTML_PARSE_AS[i] = :raw}

  # Some HTML elements like script belong to both categories (i.e. are valid in block and
  # span HTML) and don't appear therefore!
  HTML_SPAN_ELEMENTS = %w{a abbr acronym b big bdo br button cite code del dfn em i img input
                          ins kbd label option q rb rbc rp rt rtc ruby samp select small span
                          strong sub sup textarea tt var}
  HTML_BLOCK_ELEMENTS = %w{address article aside applet body button blockquote caption col colgroup dd div dl dt fieldset
                           figcaption footer form h1 h2 h3 h4 h5 h6 header hgroup hr html head iframe legend listing menu
                           li map nav ol optgroup p pre section summary table tbody td th thead tfoot tr ul}
  # Elements that are auto-closed because they cannot have any content.
  HTML_ELEMENTS_WITHOUT_BODY = %w{area base br col command embed hr img input keygen link meta param source track wbr}
end
68
+
69
+
70
+ # Contains the parsing methods. This module can be mixed into any parser to get HTML parsing
71
+ # functionality. The only thing that must be provided by the class are instance variable
72
+ # <tt>@stack</tt> for storing needed state and <tt>@src</tt> (instance of StringScanner) for
73
+ # the actual parsing.
74
+ module Parser
75
+
76
+ include Constants
77
+
78
+ # Process the HTML start tag that has already been scanned/checked. Does the common processing
79
+ # steps and then yields to the caller for further processing.
80
def handle_html_start_tag
  # Capture the match data first: later scans overwrite the groups of @src.
  tag_name = @src[1]
  self_closed = !@src[4].nil?
  attributes = Utils::OrderedHash.new
  @src[2].scan(HTML_ATTRIBUTE_RE).each {|key, _quote, content| attributes[key] = content}

  el = Element.new(:html_element, tag_name, attributes, :category => :block)
  @tree.children << el

  if !self_closed && HTML_ELEMENTS_WITHOUT_BODY.include?(el.value)
    warning("The HTML tag '#{el.value}' cannot have any content - auto-closing it")
    self_closed = true
  end

  if tag_name == 'script'
    # The script body is consumed right here, so the caller always sees a closed element.
    handle_html_script_tag
    self_closed = true
  end
  yield(el, self_closed)
end
100
+
101
# Consume the body of a 'script' element whose start tag has just been scanned and store it as
# one :raw text child of the element that was added last to the current tree.
#
# Everything up to the closing script tag is taken verbatim and the closing tag is consumed. If
# there is no closing tag, the rest of the source is taken and a warning is recorded.
def handle_html_script_tag
  start = @src.pos
  if @src.scan_until(/(?=<\/script\s*>)/m)
    # The pattern is a pure lookahead, so the scanner now sits right before the closing tag.
    body = extract_string(start...@src.pos, @src)
    add_text(body, @tree.children.last, :raw)
    @src.scan(HTML_TAG_CLOSE_RE)
  else
    add_text(@src.scan(/.*/m), @tree.children.last, :raw)
    warning("Found no end tag for 'script' - auto-closing it")
  end
end
111
+
112
# Lookahead for the start of anything tag-like: a start tag, an end tag, a comment or a
# processing instruction.
HTML_RAW_START = /(?=<(#{REXML::Parsers::BaseParser::UNAME_STR}|\/|!--|\?))/

# Parse raw HTML from the current source position, storing the found elements in +el+.
# Parsing continues until one of the following criteria is fulfilled:
#
# - The end of the document is reached.
# - The matching end tag for the element +el+ is found (only used if +el+ is an HTML
#   element).
#
# When an HTML start tag is found, processing is deferred to #handle_html_start_tag,
# providing the block given to this method.
def parse_raw_html(el, &block)
  @stack.push(@tree)
  @tree = el

  finished = false
  until @src.eos? || finished
    leading = @src.scan_until(HTML_RAW_START)
    if leading.nil?
      # Nothing tag-like is left: the remainder is plain text.
      add_text(@src.scan(/.*/m), @tree, :text)
      warning("Found no end tag for '#{@tree.value}' - auto-closing it") if @tree.type == :html_element
      finished = true
      next
    end

    add_text(leading, @tree, :text)
    if (comment = @src.scan(HTML_COMMENT_RE))
      @tree.children << Element.new(:xml_comment, comment, nil, :category => :block)
    elsif (instruction = @src.scan(HTML_INSTRUCTION_RE))
      @tree.children << Element.new(:xml_pi, instruction, nil, :category => :block)
    elsif @src.scan(HTML_TAG_RE)
      handle_html_start_tag(&block)
    elsif @src.scan(HTML_TAG_CLOSE_RE)
      if @tree.value == @src[1]
        finished = true
      else
        warning("Found invalidly used HTML closing tag for '#{@src[1]}' - ignoring it")
      end
    else
      # Only looked like markup: keep the single character as text and go on.
      add_text(@src.scan(/./), @tree, :text)
    end
  end

  @tree = @stack.pop
end
156
+
157
+ end
158
+
159
+
160
+ # Converts HTML elements to native elements if possible.
161
+ class ElementConverter
162
+
163
+ include Constants
164
+ include ::Newstile::Utils::Entities
165
+
166
+ REMOVE_TEXT_CHILDREN = %w{html head hgroup ol ul dl table colgroup tbody thead tfoot tr select optgroup}
167
+ WRAP_TEXT_CHILDREN = %w{body section nav article aside header footer address div li dd blockquote figure
168
+ figcaption fieldset form}
169
+ REMOVE_WHITESPACE_CHILDREN = %w{body section nav article aside header footer address
170
+ div li dd blockquote figure figcaption td th fieldset form}
171
+ STRIP_WHITESPACE = %w{address article aside blockquote body caption dd div dl dt fieldset figcaption form footer
172
+ header h1 h2 h3 h4 h5 h6 legend li nav p section td th}
173
+ SIMPLE_ELEMENTS = %w{em strong blockquote hr br img p thead tbody tfoot tr td th ul ol dl li dl dt dd}
174
+
175
# Create a converter for the document +doc+, which is kept in <tt>@doc</tt>.
def initialize(doc)
  @doc = doc
end
178
+
179
+ # Convert the element +el+ and its children.
180
def process(el, do_conversion = true, preserve_text = false, parent = nil)
  if [:xml_comment, :xml_pi, :html_doctype].include?(el.type)
    # Comments, processing instructions and doctypes only get a category, which is derived from
    # the tag name of the parent ('div' if there is no parent).
    parent_tag = if parent.nil?
                   'div'
                 elsif parent.type == :html_element
                   parent.value
                 else
                   {:code_span => 'code', :code_block => 'pre', :header => 'h1'}.fetch(parent.type) do
                     parent.type.to_s
                   end
                 end
    el.options = {:category => (HTML_PARSE_AS_SPAN.include?(parent_tag) ? :span : :block)}
    return
  end
  return unless el.type == :html_element

  # Remember the tag name now: a conversion may reset el.value.
  tag = el.value
  remove_text_children(el) if REMOVE_TEXT_CHILDREN.include?(tag)

  converter = "convert_#{tag}"
  if do_conversion && self.class.method_defined?(converter)
    send(converter, el)
  elsif do_conversion && SIMPLE_ELEMENTS.include?(tag)
    set_basics(el, tag.intern, HTML_SPAN_ELEMENTS.include?(tag) ? :span : :block)
    process_children(el, do_conversion, preserve_text)
  else
    process_html_element(el, do_conversion, preserve_text)
  end

  strip_whitespace(el) if STRIP_WHITESPACE.include?(tag)
  remove_whitespace_children(el) if REMOVE_WHITESPACE_CHILDREN.include?(tag)
  wrap_text_children(el) if WRAP_TEXT_CHILDREN.include?(tag)
end
217
+
218
# Process all children of +el+ in place: a text child is replaced by the elements that
# #process_text returns for its value, every other child is handed to #process with +el+ as
# parent and kept.
def process_children(el, do_conversion = true, preserve_text = false)
  kids = el.children
  kids.map! do |child|
    next process_text(child.value, preserve_text) if child.type == :text
    process(child, do_conversion, preserve_text, el)
    child
  end
  # Splice the arrays returned for text children into the child list.
  kids.flatten!
end
228
+
229
+ # Process the HTML text +raw+: compress whitespace (if +preserve+ is +false+) and convert
230
+ # entities in entity elements.
231
def process_text(raw, preserve = false)
  # Note that +raw+ itself is modified when whitespace gets compressed.
  raw.gsub!(/\s+/, ' ') unless preserve
  scanner = StringScanner.new(raw)
  elements = []
  until scanner.eos?
    leading = scanner.scan_until(/(?=#{HTML_ENTITY_RE})/)
    unless leading
      # No further entity: the remainder is plain text.
      elements << Element.new(:text, scanner.scan(/.*/m))
      next
    end

    elements << Element.new(:text, leading)
    scanner.scan(HTML_ENTITY_RE)
    # Entity name (String) or code point (Integer, given in decimal or hexadecimal form).
    val = scanner[1] || (scanner[2] && scanner[2].to_i) || scanner[3].hex
    elements << if %w{lsquo rsquo ldquo rdquo}.include?(val)
                  Element.new(:smart_quote, val.intern)
                elsif %w{mdash ndash hellip laquo raquo}.include?(val)
                  Element.new(:typographic_sym, val.intern)
                else
                  Element.new(:entity, entity(val), nil, :original => scanner.matched)
                end
  end
  elements
end
253
+
254
# Keep +el+ as an HTML element: replace its options by the category (:span or :block) and the
# parse type looked up for its tag name, then process its children.
def process_html_element(el, do_conversion = true, preserve_text = false)
  tag = el.value
  category = HTML_SPAN_ELEMENTS.include?(tag) ? :span : :block
  el.options = {:category => category, :parse_type => HTML_PARSE_AS[tag]}
  process_children(el, do_conversion, preserve_text)
end
260
+
261
# Delete all direct :text children of +el+ in place and return the child array.
def remove_text_children(el)
  kids = el.children
  kids.reject! {|child| child.type == :text}
  kids
end
264
+
265
# Wrap every run of consecutive children that are text or not of category :block in a new
# transparent :p element; block children are kept as they are. The child list of +el+ is
# replaced by the result.
def wrap_text_children(el)
  wrapped = []
  open_paragraph = nil
  el.children.each do |child|
    if child.options[:category] == :block && child.type != :text
      wrapped << child
      # A block ends the current run.
      open_paragraph = nil
    else
      unless open_paragraph
        open_paragraph = Element.new(:p, nil, nil, :transparent => true)
        wrapped << open_paragraph
      end
      open_paragraph.children << child
    end
  end
  el.children = wrapped
end
283
+
284
# Strip leading whitespace from the first child and trailing whitespace from the last child of
# +el+, each only if that child is a :text element. The text values are modified in place.
def strip_whitespace(el)
  kids = el.children
  return if kids.empty?
  head = kids.first
  tail = kids.last
  head.value.lstrip! if head.type == :text
  tail.value.rstrip! if tail.type == :text
end
293
+
294
# Delete whitespace-only :text children of +el+ that are the first child, the last child, or
# that sit directly between two siblings of category :block. The child array is modified in
# place and returned.
def remove_whitespace_children(el)
  # Neighbours are looked up in a snapshot: indexing into the array that delete_if is currently
  # compacting would depend on how the Ruby implementation shifts elements during the iteration.
  siblings = el.children.dup
  last_index = siblings.length - 1
  index = -1
  el.children.delete_if do |child|
    index += 1
    next false unless child.type == :text && child.value.strip.empty?
    index == 0 || index == last_index ||
      (siblings[index - 1].options[:category] == :block &&
       siblings[index + 1].options[:category] == :block)
  end
end
303
+
304
# Turn +el+ into an element of the given +type+: its options are replaced by a hash holding the
# +category+ plus everything in +opts+ (entries in +opts+ win) and its value is reset to +nil+.
def set_basics(el, type, category, opts = {})
  fresh_options = {:category => category}
  fresh_options.update(opts)
  el.type = type
  el.options = fresh_options
  el.value = nil
end
309
+
310
# Append the values of all :text elements found in +el+ and its descendants to the string
# +raw+, in document order (an element comes before its children). Returns the children of +el+.
def extract_text(el, raw)
  pending = [el]
  until pending.empty?
    node = pending.shift
    raw << node.value.to_s if node.type == :text
    # Children are visited before any remaining siblings.
    pending = node.children + pending
  end
  el.children
end
314
+
315
# Convert an 'a' element: with an 'href' attribute it becomes a native :a span element,
# otherwise it is kept as HTML element and nothing below it is converted.
def convert_a(el)
  return process_html_element(el, false) unless el.attr['href']
  set_basics(el, :a, :span)
  process_children(el)
end
323
+
324
# Convert a 'b' element into a native :strong span element and process its children.
def convert_b(el)
  set_basics(el, :strong, :span)
  process_children(el)
end
328
+
329
# Convert an 'i' element into a native :em span element and process its children.
def convert_i(el)
  set_basics(el, :em, :span)
  process_children(el)
end
333
+
334
# Convert a heading element into a native :header block element. The level is the digit of the
# tag name and the concatenated text of the heading is stored in the :raw_text option.
def convert_h1(el)
  # Read the level first: set_basics resets el.value.
  level = el.value[1..1].to_i
  set_basics(el, :header, :block, :level => level)
  raw_text = ''
  el.options[:raw_text] = raw_text
  extract_text(el, raw_text)
  process_children(el)
end
339
# The headings h2 to h6 are converted exactly like h1.
(2..6).each do |level|
  alias_method("convert_h#{level}".to_sym, :convert_h1)
end
342
+
343
# Convert a 'code' or 'pre' element into a native :codespan or :codeblock element if its whole
# content can be expressed as one plain string; otherwise keep it as HTML element (without
# converting anything below it and with whitespace preserved).
def convert_code(el)
  raw = ''
  extract_text(el, raw)
  result = process_text(raw, true)
  begin
    # Try to fold all parts back into one string; any failure leaves +result+ as it is so that
    # the element falls back to HTML below.
    str = result.inject('') do |mem, c|
      case c.type
      when :text
        mem << c.value
      when :entity
        if RUBY_VERSION >= '1.9'
          mem << c.value.char.encode(@doc.parse_infos[:encoding])
        elsif [60, 62, 34, 38].include?(c.value.code_point)
          mem << c.value.code_point.chr
        end
      when :smart_quote, :typographic_sym
        mem << entity(c.value.to_s).char.encode(@doc.parse_infos[:encoding])
      else
        raise "Bug - please report"
      end
    end
    result.clear
    result << Element.new(:text, str)
  rescue
  end

  single_text = !(result.length > 1) && result.first.type == :text
  if single_text
    if el.value == 'code'
      set_basics(el, :codespan, :span)
    else
      set_basics(el, :codeblock, :block)
    end
    el.value = result.first.value
    el.children.clear
  else
    process_html_element(el, false, true)
  end
end
alias :convert_pre :convert_code
380
+
381
# Convert a 'table' element into a native :table block element if it is a simple table (see
# #is_simple_table?); otherwise keep it as HTML element without converting anything below it.
#
# A table without any children is also kept as HTML element: it would pass the simple table
# check but there is no row to take the alignment from and no first child to inspect.
#
# For a converted table the :alignment option holds one :default entry per child of the first
# row that has children, and rows that are direct children of the table get wrapped in a
# :tbody element.
def convert_table(el)
  if el.children.empty? || !is_simple_table?(el)
    process_html_element(el, false)
    return
  end
  process_children(el)
  set_basics(el, :table, :block)
  el.options[:alignment] = []
  calc_alignment = lambda do |c|
    if c.type == :tr && el.options[:alignment].empty?
      el.options[:alignment] = [:default] * c.children.length
    else
      c.children.each {|cc| calc_alignment.call(cc)}
    end
  end
  calc_alignment.call(el)
  first_child = el.children.first
  if first_child && first_child.type == :tr
    tbody = Element.new(:tbody, nil, nil, :category => :block)
    tbody.children = el.children
    el.children = [tbody]
  end
end
404
+
405
# Return whether the table element +el+ is simple enough to be converted into a native table:
#
# - no 'th'/'td' cell may contain, at any depth, a non-text element whose tag name is listed in
#   HTML_BLOCK_ELEMENTS, and
# - besides text, the table consists either only of 'tr' rows with 'td' cells, or only of
#   'thead' (rows of 'th'), 'tfoot' and 'tbody' (rows of 'td') sections with at least one
#   'tbody' among them.
def is_simple_table?(el)
  only_phrasing_content = lambda do |c|
    c.children.all? do |cc|
      (cc.type == :text || !HTML_BLOCK_ELEMENTS.include?(cc.value)) && only_phrasing_content.call(cc)
    end
  end
  cells_ok = lambda do |c|
    if c.value == 'th' || c.value == 'td'
      only_phrasing_content.call(c)
    else
      c.children.all? {|cc| cells_ok.call(cc)}
    end
  end
  return false unless cells_ok.call(el)

  check_rows = lambda do |t, type|
    t.children.all? do |r|
      (r.value == 'tr' || r.type == :text) && r.children.all? {|c| c.value == type || c.type == :text}
    end
  end
  return true if check_rows.call(el, 'td')

  sections_ok = el.children.all? do |t|
    t.type == :text || (t.value == 'thead' && check_rows.call(t, 'th')) ||
      ((t.value == 'tfoot' || t.value == 'tbody') && check_rows.call(t, 'td'))
  end
  sections_ok && el.children.any? {|t| t.value == 'tbody'}
end
429
+
430
# Convert a 'div' or 'span' element: it becomes a math element if #is_math_tag? says so,
# otherwise it is kept as HTML element.
def convert_div(el)
  return handle_math_tag(el) if is_math_tag?(el)
  process_html_element(el)
end
alias :convert_span :convert_div
438
+
439
# Return a true value if the 'class' attribute of +el+ contains the word 'math' and +el+ has
# exactly one child which is a :text element.
def is_math_tag?(el)
  return nil unless el.attr['class'].to_s =~ /\bmath\b/
  kids = el.children
  kids.size == 1 && kids.first.type == :text
end
443
+
444
# Turn the 'div' or 'span' element +el+ into a :math element (category :block for 'div',
# :span otherwise). The value of its first child becomes the element's value, with the entities
# for &, ", > and < decoded.
#
# The class name 'math' is removed from the 'class' attribute. Only whole class names are
# removed, so a class like 'aftermath' is left alone; the attribute is deleted completely if no
# other class name remains.
def handle_math_tag(el)
  # The category depends on the tag name, which set_basics resets - it is evaluated first.
  set_basics(el, :math, (el.value == 'div' ? :block : :span))
  el.value = el.children.shift.value

  remaining = el.attr['class'].to_s.split(' ').reject {|name| name == 'math'}
  if remaining.empty?
    el.attr.delete('class')
  else
    el.attr['class'] = remaining.join(' ')
  end

  el.value.gsub!(/&(amp|quot|gt|lt);/) do |m|
    case m
    when '&amp;' then '&'
    when '&quot;' then '"'
    when '&gt;' then '>'
    when '&lt;' then '<'
    end
  end
end
461
+ end
462
+
463
+ include Parser
464
+
465
+ # Parse +source+ as HTML document and return the created +tree+.
466
def parse(source)
  @stack = []
  @tree = Element.new(:root)
  @src = StringScanner.new(adapt_source(source))

  # Leading processing instructions, doctypes and comments (each optionally preceded by
  # whitespace) become block elements directly below the root; they are tried in this order.
  prolog = [[:xml_pi, /\s*#{HTML_INSTRUCTION_RE}/],
            [:html_doctype, /\s*#{HTML_DOCTYPE_RE}/],
            [:xml_comment, /\s*#{HTML_COMMENT_RE}/]]
  loop do
    matched = nil
    kind, _pattern = prolog.find {|_type, pattern| matched = @src.scan(pattern)}
    break unless kind
    @tree.children << Element.new(kind, matched.strip, nil, :category => :block)
  end

  # Every start tag that is not closed yet opens a nested parse for the content of its element.
  tag_handler = lambda do |c, closed|
    parse_raw_html(c, &tag_handler) unless closed
  end
  parse_raw_html(@tree, &tag_handler)

  converter = ElementConverter.new(@doc)
  @tree.children.each {|child| converter.process(child)}
  converter.remove_whitespace_children(@tree)
  @tree
end
493
+
494
+ end
495
+
496
+ end
497
+
498
+ end
499
+