feedtools 0.2.26 → 0.2.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -1,48 +0,0 @@
1
- # :stopdoc:
2
- module HTree
3
- class Name; include HTree end
4
- class Context; include HTree end
5
-
6
- module Tag; include HTree end
7
- class STag; include Tag end
8
- class ETag; include Tag end
9
-
10
- module Node; include HTree end
11
- module Container; include Node end
12
- class Doc; include Container end
13
- class Elem; include Container end
14
- module Leaf; include Node end
15
- class Text; include Leaf end
16
- class XMLDecl; include Leaf end
17
- class DocType; include Leaf end
18
- class ProcIns; include Leaf end
19
- class Comment; include Leaf end
20
- class BogusETag; include Leaf end
21
-
22
- module Traverse end
23
- module Container::Trav; include Traverse end
24
- module Leaf::Trav; include Traverse end
25
- class Doc; module Trav; include Container::Trav end; include Trav end
26
- class Elem; module Trav; include Container::Trav end; include Trav end
27
- class Text; module Trav; include Leaf::Trav end; include Trav end
28
- class XMLDecl; module Trav; include Leaf::Trav end; include Trav end
29
- class DocType; module Trav; include Leaf::Trav end; include Trav end
30
- class ProcIns; module Trav; include Leaf::Trav end; include Trav end
31
- class Comment; module Trav; include Leaf::Trav end; include Trav end
32
- class BogusETag; module Trav; include Leaf::Trav end; include Trav end
33
-
34
- class Location; include HTree end
35
- module Container::Loc end
36
- module Leaf::Loc end
37
- class Doc; class Loc < Location; include Trav, Container::Loc end end
38
- class Elem; class Loc < Location; include Trav, Container::Loc end end
39
- class Text; class Loc < Location; include Trav, Leaf::Loc end end
40
- class XMLDecl; class Loc < Location; include Trav, Leaf::Loc end end
41
- class DocType; class Loc < Location; include Trav, Leaf::Loc end end
42
- class ProcIns; class Loc < Location; include Trav, Leaf::Loc end end
43
- class Comment; class Loc < Location; include Trav, Leaf::Loc end end
44
- class BogusETag; class Loc < Location; include Trav, Leaf::Loc end end
45
-
46
- class Error < StandardError; end
47
- end
48
- # :startdoc:
@@ -1,124 +0,0 @@
1
- # :stopdoc:
2
- require 'htree/scan' # for Pat::Nmtoken
3
- require 'htree/context'
4
-
5
- module HTree # :nodoc:
6
- # Name represents a element name and attribute name.
7
- # It consists of a namespace prefix, a namespace URI and a local name.
8
- class Name # :nodoc:
9
- =begin
10
- element name prefix uri localname
11
- {u}n, n with xmlns=u nil 'u' 'n'
12
- p{u}n, p:n with xmlns:p=u 'p' 'u' 'n'
13
- n with xmlns='' nil '' 'n'
14
-
15
- attribute name
16
- xmlns= 'xmlns' nil nil
17
- xmlns:n= 'xmlns' nil 'n'
18
- p{u}n=, p:n= with xmlns:p=u 'p' 'u' 'n'
19
- n= nil '' 'n'
20
- =end
21
- def Name.parse_element_name(name, context)
22
- if /\{(.*)\}/ =~ name
23
- # "{u}n" means "use default namespace",
24
- # "p{u}n" means "use the specified prefix p"
25
- $` == '' ? Name.new(nil, $1, $') : Name.new($`, $1, $')
26
- elsif /:/ =~ name && !context.namespace_uri($`).empty?
27
- Name.new($`, context.namespace_uri($`), $')
28
- elsif !context.namespace_uri(nil).empty?
29
- Name.new(nil, context.namespace_uri(nil), name)
30
- else
31
- Name.new(nil, '', name)
32
- end
33
- end
34
-
35
- def Name.parse_attribute_name(name, context)
36
- if name == 'xmlns'
37
- Name.new('xmlns', nil, nil)
38
- elsif /\Axmlns:/ =~ name
39
- Name.new('xmlns', nil, $')
40
- elsif /\{(.*)\}/ =~ name
41
- case $`
42
- when ''; Name.new(nil, $1, $')
43
- else Name.new($`, $1, $')
44
- end
45
- elsif /:/ =~ name && !context.namespace_uri($`).empty?
46
- Name.new($`, context.namespace_uri($`), $')
47
- else
48
- Name.new(nil, '', name)
49
- end
50
- end
51
-
52
- NameCache = {}
53
- def Name.new(namespace_prefix, namespace_uri, local_name)
54
- key = [namespace_prefix, namespace_uri, local_name, self]
55
- NameCache.fetch(key) {
56
- 0.upto(2) {|i| key[i] = key[i].dup.freeze if key[i] }
57
- NameCache[key] = super(key[0], key[1], key[2])
58
- }
59
- end
60
-
61
- def initialize(namespace_prefix, namespace_uri, local_name)
62
- @namespace_prefix = namespace_prefix
63
- @namespace_uri = namespace_uri
64
- @local_name = local_name
65
- if @namespace_prefix && /\A#{Pat::Nmtoken}\z/o !~ @namespace_prefix
66
- raise HTree::Error, "invalid namespace prefix: #{@namespace_prefix.inspect}"
67
- end
68
- if @local_name && /\A#{Pat::Nmtoken}\z/o !~ @local_name
69
- raise HTree::Error, "invalid local name: #{@local_name.inspect}"
70
- end
71
- if @namespace_prefix == 'xmlns'
72
- unless @namespace_uri == nil
73
- raise HTree::Error, "Name object for xmlns:* must not have namespace URI: #{@namespace_uri.inspect}"
74
- end
75
- else
76
- unless String === @namespace_uri
77
- raise HTree::Error, "invalid namespace URI: #{@namespace_uri.inspect}"
78
- end
79
- end
80
- end
81
- attr_reader :namespace_prefix, :namespace_uri, :local_name
82
-
83
- def xmlns?
84
- @namespace_prefix == 'xmlns' && @namespace_uri == nil
85
- end
86
-
87
- def universal_name
88
- if @namespace_uri && !@namespace_uri.empty?
89
- "{#{@namespace_uri}}#{@local_name}"
90
- else
91
- @local_name.dup
92
- end
93
- end
94
-
95
- def qualified_name
96
- if @namespace_uri && !@namespace_uri.empty?
97
- if @namespace_prefix
98
- "#{@namespace_prefix}:#{@local_name}"
99
- else
100
- @local_name.dup
101
- end
102
- elsif @local_name
103
- @local_name.dup
104
- else
105
- "xmlns"
106
- end
107
- end
108
-
109
- def to_s
110
- if @namespace_uri && !@namespace_uri.empty?
111
- if @namespace_prefix
112
- "#{@namespace_prefix}{#{@namespace_uri}}#{@local_name}"
113
- else
114
- "{#{@namespace_uri}}#{@local_name}"
115
- end
116
- elsif @local_name
117
- @local_name.dup
118
- else
119
- "xmlns"
120
- end
121
- end
122
- end
123
- end
124
- # :startdoc:
@@ -1,207 +0,0 @@
1
- # :stopdoc:
2
- require 'htree/encoder'
3
- require 'htree/doc'
4
- require 'htree/elem'
5
- require 'htree/leaf'
6
- require 'htree/text'
7
-
8
- module HTree # :nodoc:
9
-
10
- class Text # :nodoc:
11
- ChRef = {
12
- '>' => '&gt;',
13
- '<' => '&lt;',
14
- '"' => '&quot;',
15
- }
16
-
17
- def output(out, context)
18
- out.output_text @rcdata.gsub(/[<>]/) {|s| ChRef[s] }
19
- end
20
-
21
- def to_attvalue_content
22
- @rcdata.gsub(/[<>"]/) {|s| ChRef[s] }
23
- end
24
-
25
- def output_attvalue(out, context)
26
- out.output_string '"'
27
- out.output_text to_attvalue_content
28
- out.output_string '"'
29
- end
30
- end
31
-
32
- class Name # :nodoc:
33
- def output(out, context)
34
- # xxx: validate namespace prefix
35
- if xmlns?
36
- if @local_name
37
- out.output_string "xmlns:#{@local_name}"
38
- else
39
- out.output_string "xmlns"
40
- end
41
- else
42
- out.output_string qualified_name
43
- end
44
- end
45
-
46
- def output_attribute(text, out, context)
47
- output(out, context)
48
- out.output_string '='
49
- text.output_attvalue(out, context)
50
- end
51
- end
52
-
53
- class Doc # :nodoc:
54
- def output(out, context)
55
- context = DefaultContext # discard outer context
56
- xmldecl = false
57
- doctypedecl = false
58
- @children.each {|n|
59
- if n.respond_to? :output_prolog_xmldecl
60
- n.output_prolog_xmldecl(out, context) unless xmldecl # xxx: encoding?
61
- xmldecl = true
62
- elsif n.respond_to? :output_prolog_doctypedecl
63
- n.output_prolog_doctypedecl(out, context) unless doctypedecl
64
- doctypedecl = true
65
- else
66
- n.output(out, context)
67
- end
68
- }
69
- end
70
- end
71
-
72
- class Elem # :nodoc:
73
- def output(out, context)
74
- if @empty
75
- @stag.output_emptytag(out, context)
76
- else
77
- children_context = @stag.output_stag(out, context)
78
- @children.each {|n| n.output(out, children_context) }
79
- @stag.output_etag(out, context)
80
- end
81
- end
82
- end
83
-
84
- class STag # :nodoc:
85
- def output_attributes(out, context)
86
- @attributes.each {|aname, text|
87
- next if aname.xmlns?
88
- out.output_string ' '
89
- aname.output_attribute(text, out, context)
90
- }
91
- @context.output_namespaces(out, context)
92
- end
93
-
94
- def output_emptytag(out, context)
95
- out.output_string '<'
96
- @name.output(out, context)
97
- children_context = output_attributes(out, context)
98
- out.output_string "\n/>"
99
- children_context
100
- end
101
-
102
- def output_stag(out, context)
103
- out.output_string '<'
104
- @name.output(out, context)
105
- children_context = output_attributes(out, context)
106
- out.output_string "\n>"
107
- children_context
108
- end
109
-
110
- def output_etag(out, context)
111
- out.output_string '</'
112
- @name.output(out, context)
113
- out.output_string "\n>"
114
- end
115
- end
116
-
117
- class Context # :nodoc:
118
- def output_namespaces(out, outer_context)
119
- unknown_namespaces = {}
120
- @namespaces.each {|prefix, uri|
121
- outer_uri = outer_context.namespace_uri(prefix)
122
- if outer_uri == nil
123
- unknown_namespaces[prefix] = uri
124
- elsif outer_uri != uri
125
- if prefix
126
- out.output_string " xmlns:#{prefix}="
127
- else
128
- out.output_string " xmlns="
129
- end
130
- Text.new(uri).output_attvalue(out, outer_context)
131
- end
132
- }
133
- unless unknown_namespaces.empty?
134
- out.output_xmlns(unknown_namespaces)
135
- end
136
- outer_context.subst_namespaces(@namespaces)
137
- end
138
- end
139
-
140
- class BogusETag # :nodoc:
141
- # don't output anything.
142
- def output(out, context)
143
- end
144
- end
145
-
146
- class XMLDecl # :nodoc:
147
- # don't output anything.
148
- def output(out, context)
149
- end
150
-
151
- def output_prolog_xmldecl(out, context)
152
- out.output_string "<?xml version=\"#{@version}\""
153
- if @encoding
154
- out.output_string " encoding=\"#{@encoding}\""
155
- end
156
- if @standalone != nil
157
- out.output_string " standalone=\"#{@standalone ? 'yes' : 'no'}\""
158
- end
159
- out.output_string "?>"
160
- end
161
- end
162
-
163
- class DocType # :nodoc:
164
- # don't output anything.
165
- def output(out, context)
166
- end
167
-
168
- def generate_content # :nodoc:
169
- result = ''
170
- if @public_identifier
171
- result << "PUBLIC \"#{@public_identifier}\""
172
- else
173
- result << "SYSTEM"
174
- end
175
- # Although a system identifier is not omissible in XML,
176
- # we cannot output it if it is not given.
177
- if @system_identifier
178
- if /"/ !~ @system_identifier
179
- result << " \"#{@system_identifier}\""
180
- else
181
- result << " '#{@system_identifier}'"
182
- end
183
- end
184
- result
185
- end
186
-
187
- def output_prolog_doctypedecl(out, context)
188
- out.output_string "<!DOCTYPE #{@root_element_name} #{generate_content}>"
189
- end
190
- end
191
-
192
- class ProcIns # :nodoc:
193
- def output(out, context)
194
- out.output_string "<?#{@target}"
195
- out.output_string " #{@content}" if @content
196
- out.output_string "?>"
197
- end
198
- end
199
-
200
- class Comment # :nodoc:
201
- def output(out, context)
202
- out.output_string "<!--#{@content}-->"
203
- end
204
- end
205
-
206
- end
207
- # :startdoc:
@@ -1,409 +0,0 @@
1
- # :stopdoc:
2
- require 'htree/scan'
3
- require 'htree/htmlinfo'
4
- require 'htree/text'
5
- require 'htree/tag'
6
- require 'htree/leaf'
7
- require 'htree/doc'
8
- require 'htree/elem'
9
- require 'htree/raw_string'
10
- require 'htree/context'
11
- require 'htree/encoder'
12
- require 'htree/fstr'
13
-
14
- module HTree # :nodoc:
15
- # HTree.parse parses <i>input</i> and return a document tree.
16
- # represented by HTree::Doc.
17
- #
18
- # <i>input</i> should be a String or
19
- # an object which respond to read or open method.
20
- # For example, IO, StringIO, Pathname, URI::HTTP and URI::FTP are acceptable.
21
- # Note that the URIs need open-uri.
22
- #
23
- # HTree.parse guesses <i>input</i> is HTML or not and XML or not.
24
- #
25
- # If it is guessed as HTML, the default namespace in the result is set to http://www.w3.org/1999/xhtml
26
- # regardless of <i>input</i> has XML namespace declaration or not nor even it is pre-XML HTML.
27
- #
28
- # If it is guessed as HTML and not XML, all element and attribute names are downcaseed.
29
- #
30
- # If opened file or read content has charset method,
31
- # HTree.parse decode it according to $KCODE before parsing.
32
- # Otherwise HTree.parse assumes the character encoding of the content is
33
- # compatible to $KCODE.
34
- # Note that the charset method is provided by URI::HTTP with open-uri.
35
- def HTree.parse(input)
36
- HTree.with_frozen_string_hash {
37
- parse_as(input, false)
38
- }
39
- end
40
-
41
- # HTree.parse_xml parses <i>input</i> as XML and
42
- # return a document tree represented by HTree::Doc.
43
- #
44
- # It behaves almost same as HTree.parse but it assumes <i>input</> is XML
45
- # even if no XML declaration.
46
- # The assumption causes following differences.
47
- # * doesn't downcase element name.
48
- # * The content of <script> and <style> element is PCDATA, not CDATA.
49
- def HTree.parse_xml(input)
50
- HTree.with_frozen_string_hash {
51
- parse_as(input, true)
52
- }
53
- end
54
-
55
- def HTree.parse_as(input, is_xml)
56
- input_charset = nil
57
- if input.tainted? && 1 <= $SAFE
58
- raise SecurityError, "input tainted"
59
- end
60
- if input.respond_to? :read # IO, StringIO
61
- input = input.read.untaint
62
- input_charset = input.charset if input.respond_to? :charset
63
- elsif input.respond_to? :open # Pathname, URI with open-uri
64
- input.open {|f|
65
- input = f.read.untaint
66
- input_charset = f.charset if f.respond_to? :charset
67
- }
68
- end
69
- if input_charset && input_charset != Encoder.internal_charset
70
- input = Iconv.conv(Encoder.internal_charset, input_charset, input)
71
- end
72
-
73
- tokens = []
74
- is_xml, is_html = HTree.scan(input, is_xml) {|token|
75
- tokens << token
76
- }
77
- context = is_html ? HTMLContext: DefaultContext
78
- structure_list = parse_pairs(tokens, is_xml, is_html)
79
- structure_list = fix_structure_list(structure_list, is_xml, is_html)
80
- nodes = structure_list.map {|s| build_node(s, is_xml, is_html, context) }
81
- Doc.new(nodes)
82
- end
83
-
84
- def HTree.parse_pairs(tokens, is_xml, is_html)
85
- stack = [[nil, nil, []]]
86
- tokens.each {|token|
87
- case token[0]
88
- when :stag
89
- stag_raw_string = token[1]
90
- stagname = stag_raw_string[Pat::Name]
91
- stagname = stagname.downcase if !is_xml && is_html
92
- stagname = HTree.frozen_string(stagname)
93
- stack << [stagname, stag_raw_string, []]
94
- when :etag
95
- etag_raw_string = token[1]
96
- etagname = etag_raw_string[Pat::Name]
97
- etagname = etagname.downcase if !is_xml && is_html
98
- etagname = HTree.frozen_string(etagname)
99
- matched_elem = nil
100
- stack.reverse_each {|elem|
101
- stagname, _, _ = elem
102
- if stagname == etagname
103
- matched_elem = elem
104
- break
105
- end
106
- }
107
- if matched_elem
108
- # This line breaks in Rails 1.1.
109
- #until matched_elem.equal? stack.last
110
- until matched_elem.object_id == stack.last.object_id
111
- stagname, stag_raw_string, children = stack.pop
112
- stack.last[2] << [:elem, stag_raw_string, children]
113
- end
114
- stagname, stag_raw_string, children = stack.pop
115
- stack.last[2] << [:elem, stag_raw_string, children, etag_raw_string]
116
- else
117
- stack.last[2] << [:bogus_etag, etag_raw_string]
118
- end
119
- else
120
- stack.last[2] << token
121
- end
122
- }
123
- elem = nil
124
- while 1 < stack.length
125
- stagname, stag_raw_string, children = stack.pop
126
- stack.last[2] << [:elem, stag_raw_string, children]
127
- end
128
- stack[0][2]
129
- end
130
-
131
- def HTree.fix_structure_list(structure_list, is_xml, is_html)
132
- result = []
133
- rest = structure_list.dup
134
- until rest.empty?
135
- structure = rest.shift
136
- if structure[0] == :elem
137
- elem, rest2 = fix_element(structure, [], [], is_xml, is_html)
138
- result << elem
139
- rest = rest2 + rest
140
- else
141
- result << structure
142
- end
143
- end
144
- result
145
- end
146
-
147
- def HTree.fix_element(elem, excluded_tags, included_tags, is_xml, is_html)
148
- stag_raw_string = elem[1]
149
- children = elem[2]
150
- if etag_raw_string = elem[3]
151
- return [:elem, stag_raw_string, fix_structure_list(children, is_xml, is_html), etag_raw_string], []
152
- else
153
- tagname = stag_raw_string[Pat::Name]
154
- tagname = tagname.downcase if !is_xml && is_html
155
- if ElementContent[tagname] == :EMPTY
156
- return [:elem, stag_raw_string, []], children
157
- else
158
- if ElementContent[tagname] == :CDATA
159
- possible_tags = []
160
- else
161
- possible_tags = ElementContent[tagname]
162
- end
163
- if possible_tags
164
- excluded_tags2 = ElementExclusions[tagname]
165
- included_tags2 = ElementInclusions[tagname]
166
- excluded_tags |= excluded_tags2 if excluded_tags2
167
- included_tags |= included_tags2 if included_tags2
168
- containable_tags = (possible_tags | included_tags) - excluded_tags
169
- uncontainable_tags = ElementContent.keys - containable_tags
170
- else
171
- # If the tagname is unknown, it is assumed that any element
172
- # except excluded can be contained.
173
- uncontainable_tags = excluded_tags
174
- end
175
- fixed_children = []
176
- rest = children
177
- until rest.empty?
178
- if rest[0][0] == :elem
179
- elem = rest.shift
180
- elem_tagname = elem[1][Pat::Name]
181
- elem_tagname = elem_tagname.downcase if !is_xml && is_html
182
- if uncontainable_tags.include? elem_tagname
183
- rest.unshift elem
184
- break
185
- else
186
- fixed_elem, rest2 = fix_element(elem, excluded_tags, included_tags, is_xml, is_html)
187
- fixed_children << fixed_elem
188
- rest = rest2 + rest
189
- end
190
- else
191
- fixed_children << rest.shift
192
- end
193
- end
194
- return [:elem, stag_raw_string, fixed_children], rest
195
- end
196
- end
197
- end
198
-
199
- def HTree.build_node(structure, is_xml, is_html, inherited_context=DefaultContext)
200
- case structure[0]
201
- when :text_pcdata
202
- Text.parse_pcdata(structure[1])
203
- when :elem
204
- _, stag_rawstring, children, etag_rawstring = structure
205
- etag = etag_rawstring && ETag.parse(etag_rawstring, is_xml, is_html)
206
- stag = STag.parse(stag_rawstring, true, is_xml, is_html, inherited_context)
207
- if !children.empty? || etag
208
- Elem.new!(stag,
209
- children.map {|c| build_node(c, is_xml, is_html, stag.context) },
210
- etag)
211
- else
212
- Elem.new!(stag)
213
- end
214
- when :emptytag
215
- Elem.new!(STag.parse(structure[1], false, is_xml, is_html, inherited_context))
216
- when :bogus_etag
217
- BogusETag.parse(structure[1], is_xml, is_html)
218
- when :xmldecl
219
- XMLDecl.parse(structure[1])
220
- when :doctype
221
- DocType.parse(structure[1], is_xml, is_html)
222
- when :procins
223
- ProcIns.parse(structure[1])
224
- when :comment
225
- Comment.parse(structure[1])
226
- when :text_cdata_content
227
- Text.parse_cdata_content(structure[1])
228
- when :text_cdata_section
229
- Text.parse_cdata_section(structure[1])
230
- else
231
- raise Exception, "[bug] unknown structure: #{structure.inspect}"
232
- end
233
- end
234
-
235
# Parses the raw text of a start tag (+is_stag+ true) or an empty-element tag
# (+is_stag+ false) into an STag.
#
# The text is first tried against the "valid" tag pattern and then against
# the "invalid" one; HTree::Error is raised when neither matches. Element and
# attribute names are lowercased when the input is HTML and not XML.
#
# An attribute written as a bare value (no name) gets its name from
# OmittedAttrName: when there is an entry for the element name, the
# lowercased value is looked up in it and used as-is if absent; otherwise the
# value itself becomes the name.
#
# The resulting STag is created with +inherited_context+ and keeps
# +raw_string+ as its raw_string.
def STag.parse(raw_string, is_stag, is_xml, is_html, inherited_context=DefaultContext)
  fold_case = !is_xml && is_html
  raw_attrs = []

  # Builds a [name, value] pair from one attribute match; group 5 holds a
  # value that was written without a name.
  to_pair = lambda do |m|
    m[5] ? [nil, m[5]] : [m[1], m[2] || m[3] || m[4]]
  end

  if (is_stag ? /\A#{Pat::ValidStartTag_C}\z/o : /\A#{Pat::ValidEmptyTag_C}\z/o) =~ raw_string
    qname = Regexp.last_match(1)
    attr_text = Regexp.last_match(2)
    attr_text.scan(Pat::ValidAttr_C) { raw_attrs << to_pair.call(Regexp.last_match) }
  elsif (is_stag ? /\A#{Pat::InvalidStartTag_C}\z/o : /\A#{Pat::InvalidEmptyTag_C}\z/o) =~ raw_string
    # Read every group before scanning, because scan replaces the last match.
    qname = Regexp.last_match(1)
    attr_text = Regexp.last_match(2)
    trailing_attr = Regexp.last_match(3)
    attr_text.scan(Pat::InvalidAttr1_C) { raw_attrs << to_pair.call(Regexp.last_match) }
    if trailing_attr
      /#{Pat::InvalidAttr1End_C}/o =~ trailing_attr
      raw_attrs << [Regexp.last_match(1), Regexp.last_match(2) || Regexp.last_match(3)]
    end
  else
    raise HTree::Error, "cannot recognize as start tag or empty tag: #{raw_string.inspect}"
  end

  qname = qname.downcase if fold_case

  attrs = raw_attrs.map do |attr_name, attr_value|
    if attr_name
      attr_name = attr_name.downcase if fold_case
      [attr_name, Text.parse_pcdata(attr_value)]
    else
      value_to_name = OmittedAttrName[qname]
      implied_name =
        if value_to_name
          lowered = attr_value.downcase
          value_to_name.fetch(lowered, lowered)
        else
          attr_value
        end
      [implied_name, Text.new(attr_value)]
    end
  end

  stag = STag.new(qname, attrs, inherited_context)
  stag.raw_string = raw_string
  stag
end
277
-
278
# Parses the raw text of an end tag into an instance of the receiving class.
#
# The tag name is taken from the first capture of Pat::EndTag_C and is
# lowercased when the input is HTML and not XML. The returned object keeps
# +raw_string+ as its raw_string.
#
# Raises HTree::Error when +raw_string+ is not matched in full.
def ETag.parse(raw_string, is_xml, is_html)
  matched = /\A#{Pat::EndTag_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}" unless matched

  tag_name = matched[1]
  tag_name = tag_name.downcase if is_html && !is_xml

  new(tag_name).tap { |etag| etag.raw_string = raw_string }
end
290
-
291
# Parses the raw text of a stray end tag into an instance of the receiving
# class.
#
# Recognition uses Pat::EndTag_C; its first capture is the tag name, which is
# lowercased when the input is HTML and not XML. The returned object keeps
# +raw_string+ as its raw_string.
#
# Raises HTree::Error when +raw_string+ is not matched in full.
def BogusETag.parse(raw_string, is_xml, is_html)
  end_tag = /\A#{Pat::EndTag_C}\z/o.match(raw_string)
  if end_tag.nil?
    raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}"
  end

  lowercase_names = is_html && !is_xml
  qname = lowercase_names ? end_tag[1].downcase : end_tag[1]

  bogus = new(qname)
  bogus.raw_string = raw_string
  bogus
end
303
-
304
# Builds a Text from parsed character data, repairing sloppy ampersands.
#
# Every "&" (with an optional numeric or named reference after it) is
# rewritten as follows:
# * a reference already ending in ";" is kept unchanged;
# * a numeric reference lacking ";" gets one appended;
# * a lone "&" becomes "&amp;";
# * a name without ";" gets the ";" appended when it matches
#   NamedCharactersPattern, otherwise its "&" is escaped as "&amp;".
#
# When nothing needed repair, the very same +raw_string+ object is reused as
# the text content. The returned Text keeps +raw_string+ as its raw_string.
def Text.parse_pcdata(raw_string)
  repaired = raw_string.gsub(/&(?:(?:#[0-9]+|#x[0-9a-fA-F]+|([A-Za-z][A-Za-z0-9]*));?)?/o) do |ref|
    entity_name = Regexp.last_match(1)
    if ref.end_with?(';')
      ref
    elsif ref.start_with?('&#')
      "#{ref};"
    elsif ref == '&'
      '&amp;'
    elsif NamedCharactersPattern =~ entity_name
      "&#{entity_name};"
    else
      "&amp;#{entity_name}"
    end
  end

  # Share the caller's string object when the substitution changed nothing.
  repaired = raw_string if repaired == raw_string

  text = Text.new_internal(repaired)
  text.raw_string = raw_string
  text
end
327
-
328
# Builds a Text directly from +raw_string+ (no reference repair is applied
# here) and records +raw_string+ as the node's raw_string.
def Text.parse_cdata_content(raw_string)
  Text.new(raw_string).tap { |text| text.raw_string = raw_string }
end
333
-
334
# Builds a Text from a CDATA section.
#
# The text content is the first capture of Pat::CDATA_C; the returned Text
# keeps the complete +raw_string+ as its raw_string.
#
# Raises HTree::Error when +raw_string+ is not matched in full.
def Text.parse_cdata_section(raw_string)
  section = /\A#{Pat::CDATA_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as CDATA section: #{raw_string.inspect}" unless section

  Text.new(section[1]).tap { |text| text.raw_string = raw_string }
end
345
-
346
# Parses the raw text of an XML declaration into an XMLDecl.
#
# Version, encoding and the standalone flag each come from one of two
# alternative captures of Pat::XmlDecl_C (1/2, 3/4 and 5/6 respectively).
# The standalone flag becomes true for 'yes', false for 'no' and nil for
# anything else, including when it is absent.
#
# Raises HTree::Error when +raw_string+ is not matched in full.
def XMLDecl.parse(raw_string)
  decl = /\A#{Pat::XmlDecl_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as XML declaration: #{raw_string.inspect}" unless decl

  version = decl[1] || decl[2]
  encoding = decl[3] || decl[4]
  # Unlisted values (and a missing flag) fall through to nil.
  standalone = { 'yes' => true, 'no' => false }[decl[5] || decl[6]]

  xmldecl = XMLDecl.new(version, encoding, standalone)
  xmldecl.raw_string = raw_string
  xmldecl
end
366
-
367
# Parses the raw text of a document type declaration into a DocType.
#
# The root element name is the first capture of Pat::DocType_C; the public
# and system identifiers each come from one of two alternative captures
# (2/3 and 4/5). The root element name is lowercased when the input is HTML
# and not XML. The returned DocType keeps +raw_string+ as its raw_string.
#
# Raises HTree::Error when +raw_string+ is not matched in full.
def DocType.parse(raw_string, is_xml, is_html)
  unless /\A#{Pat::DocType_C}\z/o =~ raw_string
    # The message names the construct that actually failed to parse; it used
    # to claim "XML declaration", which pointed readers at the wrong parser.
    raise HTree::Error, "cannot recognize as document type declaration: #{raw_string.inspect}"
  end

  root_element_name = $1
  public_identifier = $2 || $3
  system_identifier = $4 || $5

  root_element_name = root_element_name.downcase if !is_xml && is_html

  result = DocType.new(root_element_name, public_identifier, system_identifier)
  result.raw_string = raw_string
  result
end
382
-
383
# Parses the raw text of a processing instruction into a ProcIns.
#
# The target and the content are the first and second captures of
# Pat::XmlProcIns_C. The returned ProcIns keeps +raw_string+ as its
# raw_string.
#
# Raises HTree::Error when +raw_string+ is not matched in full.
def ProcIns.parse(raw_string)
  pi_match = /\A#{Pat::XmlProcIns_C}\z/o.match(raw_string)
  if pi_match.nil?
    raise HTree::Error, "cannot recognize as processing instruction: #{raw_string.inspect}"
  end

  ProcIns.new(pi_match[1], pi_match[2]).tap { |procins| procins.raw_string = raw_string }
end
395
-
396
# Parses the raw text of a comment into a Comment.
#
# The comment content is the first capture of Pat::Comment_C. The returned
# Comment keeps the complete +raw_string+ as its raw_string.
#
# Raises HTree::Error when +raw_string+ is not matched in full.
def Comment.parse(raw_string)
  comment_match = /\A#{Pat::Comment_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as comment: #{raw_string.inspect}" unless comment_match

  comment = Comment.new(comment_match[1])
  comment.raw_string = raw_string
  comment
end
407
-
408
- end
409
- # :startdoc: