feedtools 0.2.26 → 0.2.27

Sign up to get free protection for your applications and to get access to all the features.
Files changed (166)
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -1,4 +1,29 @@
1
1
  require 'rexml/document'
2
+ require 'yaml'
3
+
4
+ module YAML
5
+ def YAML.dump( obj, io = nil )
6
+ if obj.kind_of?(FeedTools::Feed) || obj.kind_of?(FeedTools::FeedItem)
7
+ # Dangit, you WILL NOT serialize these things.
8
+ obj.instance_variable_set("@xml_document", nil)
9
+ obj.instance_variable_set("@root_node", nil)
10
+ obj.instance_variable_set("@channel_node", nil)
11
+ end
12
+ obj.to_yaml( io || io2 = StringIO.new )
13
+ io || ( io2.rewind; io2.read )
14
+ end
15
+
16
+ def YAML.load( io )
17
+ yp = parser.load( io )
18
+ if yp.kind_of?(FeedTools::Feed) || yp.kind_of?(FeedTools::FeedItem)
19
+ # No really, I'm serious, you WILL NOT deserialize these things.
20
+ yp.instance_variable_set("@xml_document", nil)
21
+ yp.instance_variable_set("@root_node", nil)
22
+ yp.instance_variable_set("@channel_node", nil)
23
+ end
24
+ yp
25
+ end
26
+ end
2
27
 
3
28
  module REXML # :nodoc:
4
29
  class LiberalXPathParser < XPathParser # :nodoc:
@@ -640,4 +665,5 @@ module REXML # :nodoc:
640
665
  end
641
666
  end
642
667
  end
643
- end
668
+ end
669
+
@@ -0,0 +1,10 @@
1
+ == 0.10.0 2007-10-08
2
+ * proof-of-concept validator
3
+ * easier to localize error reporting
4
+ * many unit tests
5
+
6
+ == 0.1.0 / 2007-08-07
7
+
8
+ * 1 major enhancement
9
+ * Birthday!
10
+
@@ -0,0 +1,117 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README
4
+ Rakefile.rb
5
+ bin/html5
6
+ lib/core_ext/string.rb
7
+ lib/html5.rb
8
+ lib/html5/constants.rb
9
+ lib/html5/filters/base.rb
10
+ lib/html5/filters/inject_meta_charset.rb
11
+ lib/html5/filters/iso639codes.rb
12
+ lib/html5/filters/optionaltags.rb
13
+ lib/html5/filters/rfc2046.rb
14
+ lib/html5/filters/rfc3987.rb
15
+ lib/html5/filters/sanitizer.rb
16
+ lib/html5/filters/validator.rb
17
+ lib/html5/filters/whitespace.rb
18
+ lib/html5/html5parser.rb
19
+ lib/html5/html5parser/after_body_phase.rb
20
+ lib/html5/html5parser/after_frameset_phase.rb
21
+ lib/html5/html5parser/after_head_phase.rb
22
+ lib/html5/html5parser/before_head_phase.rb
23
+ lib/html5/html5parser/in_body_phase.rb
24
+ lib/html5/html5parser/in_caption_phase.rb
25
+ lib/html5/html5parser/in_cell_phase.rb
26
+ lib/html5/html5parser/in_column_group_phase.rb
27
+ lib/html5/html5parser/in_frameset_phase.rb
28
+ lib/html5/html5parser/in_head_phase.rb
29
+ lib/html5/html5parser/in_row_phase.rb
30
+ lib/html5/html5parser/in_select_phase.rb
31
+ lib/html5/html5parser/in_table_body_phase.rb
32
+ lib/html5/html5parser/in_table_phase.rb
33
+ lib/html5/html5parser/initial_phase.rb
34
+ lib/html5/html5parser/phase.rb
35
+ lib/html5/html5parser/root_element_phase.rb
36
+ lib/html5/html5parser/trailing_end_phase.rb
37
+ lib/html5/inputstream.rb
38
+ lib/html5/liberalxmlparser.rb
39
+ lib/html5/sanitizer.rb
40
+ lib/html5/serializer.rb
41
+ lib/html5/serializer/htmlserializer.rb
42
+ lib/html5/serializer/xhtmlserializer.rb
43
+ lib/html5/sniffer.rb
44
+ lib/html5/tokenizer.rb
45
+ lib/html5/treebuilders.rb
46
+ lib/html5/treebuilders/base.rb
47
+ lib/html5/treebuilders/hpricot.rb
48
+ lib/html5/treebuilders/rexml.rb
49
+ lib/html5/treebuilders/simpletree.rb
50
+ lib/html5/treewalkers.rb
51
+ lib/html5/treewalkers/base.rb
52
+ lib/html5/treewalkers/hpricot.rb
53
+ lib/html5/treewalkers/rexml.rb
54
+ lib/html5/treewalkers/simpletree.rb
55
+ lib/html5/version.rb
56
+ testdata/encoding/chardet/test_big5.txt
57
+ testdata/encoding/test-yahoo-jp.dat
58
+ testdata/encoding/tests1.dat
59
+ testdata/encoding/tests2.dat
60
+ testdata/sanitizer/tests1.dat
61
+ testdata/serializer/core.test
62
+ testdata/serializer/injectmeta.test
63
+ testdata/serializer/optionaltags.test
64
+ testdata/serializer/options.test
65
+ testdata/serializer/whitespace.test
66
+ testdata/sites/google-results.htm
67
+ testdata/sites/python-ref-import.htm
68
+ testdata/sites/web-apps-old.htm
69
+ testdata/sites/web-apps.htm
70
+ testdata/sniffer/htmlOrFeed.json
71
+ testdata/tokenizer/contentModelFlags.test
72
+ testdata/tokenizer/entities.test
73
+ testdata/tokenizer/escapeFlag.test
74
+ testdata/tokenizer/test1.test
75
+ testdata/tokenizer/test2.test
76
+ testdata/tokenizer/test3.test
77
+ testdata/tokenizer/test4.test
78
+ testdata/tree-construction/tests1.dat
79
+ testdata/tree-construction/tests2.dat
80
+ testdata/tree-construction/tests3.dat
81
+ testdata/tree-construction/tests4.dat
82
+ testdata/tree-construction/tests5.dat
83
+ testdata/tree-construction/tests6.dat
84
+ testdata/validator/attributes.test
85
+ testdata/validator/base-href-attribute.test
86
+ testdata/validator/base-target-attribute.test
87
+ testdata/validator/blockquote-cite-attribute.test
88
+ testdata/validator/classattribute.test
89
+ testdata/validator/contenteditableattribute.test
90
+ testdata/validator/contextmenuattribute.test
91
+ testdata/validator/dirattribute.test
92
+ testdata/validator/draggableattribute.test
93
+ testdata/validator/html-xmlns-attribute.test
94
+ testdata/validator/idattribute.test
95
+ testdata/validator/inputattributes.test
96
+ testdata/validator/irrelevantattribute.test
97
+ testdata/validator/langattribute.test
98
+ testdata/validator/li-value-attribute.test
99
+ testdata/validator/link-href-attribute.test
100
+ testdata/validator/link-hreflang-attribute.test
101
+ testdata/validator/link-rel-attribute.test
102
+ testdata/validator/ol-start-attribute.test
103
+ testdata/validator/starttags.test
104
+ testdata/validator/style-scoped-attribute.test
105
+ testdata/validator/tabindexattribute.test
106
+ tests/preamble.rb
107
+ tests/test_encoding.rb
108
+ tests/test_lxp.rb
109
+ tests/test_parser.rb
110
+ tests/test_sanitizer.rb
111
+ tests/test_serializer.rb
112
+ tests/test_sniffer.rb
113
+ tests/test_stream.rb
114
+ tests/test_tokenizer.rb
115
+ tests/test_treewalkers.rb
116
+ tests/test_validator.rb
117
+ tests/tokenizer_test_parser.rb
@@ -0,0 +1,45 @@
1
+ html5
2
+ by Ryan King, et al
3
+ http://code.google.com/p/html5lib
4
+
5
+ == DESCRIPTION:
6
+
7
+ A ruby implementation of the parsing algorithm in HTML5.
8
+
9
+
10
+ == FEATURES/PROBLEMS:
11
+
12
+
13
+
14
+ == SYNOPSIS:
15
+
16
+ TODO
17
+
18
+ == REQUIREMENTS:
19
+
20
+ * chardet, only tested with 0.9.0
21
+
22
+ == INSTALL:
23
+
24
+ * sudo gem install html5
25
+
26
+ == LICENSE:
27
+
28
+ Copyright (c) 2006-2007 The Authors
29
+
30
+ Contributers:
31
+ James Graham - jg307@cam.ac.uk
32
+ Anne van Kesteren - annevankesteren@gmail.com
33
+ Lachlan Hunt - lachlan.hunt@lachy.id.au
34
+ Matt McDonald - kanashii@kanashii.ca
35
+ Sam Ruby - rubys@intertwingly.net
36
+ Ian Hickson (Google) - ian@hixie.ch
37
+ Thomas Broyer - t.broyer@ltgt.net
38
+ Jacques Distler - distler@golem.ph.utexas.edu
39
+ Ryan King - ryan@theryanking.com
40
+
41
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
42
+
43
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
44
+
45
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,33 @@
1
+ require 'rake'
2
+ require 'hoe'
3
+ require 'lib/html5/version'
4
+
5
+ Hoe.new("html5", HTML5::VERSION) do |p|
6
+ p.name = "html5"
7
+ p.description = p.paragraphs_of('README', 2..5).join("\n\n")
8
+ p.summary = "HTML5 parser/tokenizer."
9
+
10
+ p.author = ['Ryan King'] # TODO: add more names
11
+ p.email = 'ryan@theryanking.com'
12
+ p.url = 'http://code.google.com/p/html5lib'
13
+ p.need_zip = true
14
+
15
+ p.extra_deps << ['chardet', '>= 0.9.0']
16
+ p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
17
+ end
18
+
19
+ require 'rcov/rcovtask'
20
+
21
+ namespace :test do
22
+ namespace :coverage do
23
+ desc "Delete aggregate coverage data."
24
+ task(:clean) { rm_f "coverage.data" }
25
+ end
26
+ desc 'Aggregate code coverage for unit, functional and integration tests'
27
+ Rcov::RcovTask.new(:coverage => "test:coverage:clean") do |t|
28
+ t.libs << "tests"
29
+ t.test_files = FileList["tests/test_*.rb"]
30
+ t.output_dir = "tests/coverage/"
31
+ t.verbose = true
32
+ end
33
+ end
@@ -0,0 +1,217 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'core_ext/string'
4
+ $:.unshift File.dirname(__FILE__), 'lib'
5
+
6
+ def parse(opts, args)
7
+ encoding = nil
8
+
9
+ f = args[-1]
10
+ if f
11
+ begin
12
+ if f[0..6] == 'http://'
13
+ require 'open-uri'
14
+ f = URI.parse(f).open
15
+ encoding = f.charset
16
+ elsif f == '-'
17
+ f = $stdin
18
+ else
19
+ f = open(f)
20
+ end
21
+ rescue
22
+ end
23
+ else
24
+ $stderr.write("No filename provided. Use -h for help\n")
25
+ exit(1)
26
+ end
27
+
28
+ require 'html5/treebuilders'
29
+ treebuilder = HTML5::TreeBuilders[opts.treebuilder]
30
+
31
+ if opts.output == :xml
32
+ require 'html5/liberalxmlparser'
33
+ p = HTML5::XMLParser.new(:tree=>treebuilder)
34
+ else
35
+ require 'html5/html5parser'
36
+ p = HTML5::HTMLParser.new(:tree=>treebuilder)
37
+ end
38
+
39
+ if opts.parsemethod == :parse
40
+ args = [f, encoding]
41
+ else
42
+ args = [f, (opts.container || 'div'), encoding]
43
+ end
44
+
45
+ if opts.profile
46
+ require 'profiler'
47
+ Profiler__::start_profile
48
+ p.send(opts.parsemethod, *args)
49
+ Profiler__::stop_profile
50
+ Profiler__::print_profile($stderr)
51
+ elsif opts.time
52
+ require 'time' # TODO: switch to benchmark
53
+ t0 = Time.new
54
+ document = p.send(opts.parsemethod, *args)
55
+ t1 = Time.new
56
+ print_output(p, document, opts)
57
+ t2 = Time.new
58
+ puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1]
59
+ else
60
+ document = p.send(opts.parsemethod, *args)
61
+ print_output(p, document, opts)
62
+ end
63
+ end
64
+
65
+ def print_output(parser, document, opts)
66
+ puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding
67
+
68
+ case opts.output
69
+ when :xml
70
+ print document
71
+ when :html
72
+ require 'html5/treewalkers'
73
+ tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
74
+ require 'html5/serializer'
75
+ puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
76
+ when :hilite
77
+ print document.hilite
78
+ when :tree
79
+ document = [document] unless document.respond_to?(:each)
80
+ document.each {|fragment| puts parser.tree.testSerializer(fragment)}
81
+ end
82
+
83
+ if opts.error
84
+ errList=[]
85
+ for pos, errorcode, datavars in parser.errors
86
+ errList << "Line #{pos[0]} Col #{pos[1]} " + (HTML5::E[errorcode] || "Unknown error \"#{errorcode}\"") % datavars
87
+ end
88
+ $stdout.write("\nParse errors:\n" + errList.join("\n")+"\n")
89
+ end
90
+ end
91
+
92
+ require 'ostruct'
93
+ options = OpenStruct.new
94
+ options.profile = false
95
+ options.time = false
96
+ options.output = :html
97
+ options.treebuilder = 'simpletree'
98
+ options.error = false
99
+ options.encoding = false
100
+ options.parsemethod = :parse
101
+ options.serializer = {
102
+ :encoding => 'utf-8',
103
+ :omit_optional_tags => false,
104
+ :inject_meta_charset => false
105
+ }
106
+
107
+ require 'optparse'
108
+ opts = OptionParser.new do |opts|
109
+ opts.separator ""
110
+ opts.separator "Parse Options:"
111
+
112
+ opts.on("-b", "--treebuilder NAME") do |treebuilder|
113
+ options.treebuilder = treebuilder
114
+ end
115
+
116
+ opts.on("-f", "--fragment CONTAINER", "Parse as a fragment") do |container|
117
+ options.parsemethod = :parse_fragment
118
+ options.container = container if container
119
+ end
120
+
121
+ opts.separator ""
122
+ opts.separator "Filter Options:"
123
+
124
+ opts.on("--[no-]inject-meta-charset", "inject <meta charset>") do |inject|
125
+ options.serializer[:inject_meta_charset] = inject
126
+ end
127
+
128
+ opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip|
129
+ options.serializer[:strip_whitespace] = strip
130
+ end
131
+
132
+ opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize|
133
+ options.serializer[:sanitize] = sanitize
134
+ end
135
+
136
+ opts.separator ""
137
+ opts.separator "Output Options:"
138
+
139
+ opts.on("--tree", "output as debug tree") do |tree|
140
+ options.output = :tree
141
+ end
142
+
143
+ opts.on("-x", "--xml", "output as xml") do |xml|
144
+ options.output = :xml
145
+ options.treebuilder = "rexml"
146
+ end
147
+
148
+ opts.on("--[no-]html", "Output as html") do |html|
149
+ options.output = (html ? :html : nil)
150
+ end
151
+
152
+ opts.on("--hilite", "Output as formatted highlighted code.") do |hilite|
153
+ options.output = :hilite
154
+ end
155
+
156
+ opts.on("-e", "--error", "Print a list of parse errors") do |error|
157
+ options.error = error
158
+ end
159
+
160
+ opts.separator ""
161
+ opts.separator "Serialization Options:"
162
+
163
+ opts.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit|
164
+ options.serializer[:omit_optional_tags] = omit
165
+ end
166
+
167
+ opts.on("--[no-]quote-attr-values", "Quote attribute values") do |quote|
168
+ options.serializer[:quote_attr_values] = quote
169
+ end
170
+
171
+ opts.on("--[no-]use-best-quote-char", "Use best quote character") do |best|
172
+ options.serializer[:use_best_quote_char] = best
173
+ end
174
+
175
+ opts.on("--quote-char C", "Use specified quote character") do |c|
176
+ options.serializer[:quote_char] = c
177
+ end
178
+
179
+ opts.on("--[no-]minimize-boolean-attributes", "Minimize boolean attributes") do |min|
180
+ options.serializer[:minimize_boolean_attributes] = min
181
+ end
182
+
183
+ opts.on("--[no-]use-trailing-solidus", "Use trailing solidus") do |slash|
184
+ options.serializer[:use_trailing_solidus] = slash
185
+ end
186
+
187
+ opts.on("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values") do |lt|
188
+ options.serializer[:escape_lt_in_attrs] = lt
189
+ end
190
+
191
+ opts.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata|
192
+ options.serializer[:escape_rcdata] = rcdata
193
+ end
194
+
195
+ opts.separator ""
196
+ opts.separator "Other Options:"
197
+
198
+ opts.on("-p", "--[no-]profile", "Profile the run") do |profile|
199
+ options.profile = profile
200
+ end
201
+
202
+ opts.on("-t", "--[no-]time", "Time the run") do |time|
203
+ options.time = time
204
+ end
205
+
206
+ opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
207
+ options.encoding = encoding
208
+ end
209
+
210
+ opts.on_tail("-h", "--help", "Show this message") do
211
+ puts opts
212
+ exit
213
+ end
214
+ end
215
+
216
+ opts.parse!(ARGV)
217
+ parse options, ARGV