feedtools 0.2.26 → 0.2.27

Sign up to get free protection for your applications and to get access to all the features.
Files changed (166)
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -1,48 +0,0 @@
1
- # :stopdoc:
2
- module HTree
3
- class Name; include HTree end
4
- class Context; include HTree end
5
-
6
- module Tag; include HTree end
7
- class STag; include Tag end
8
- class ETag; include Tag end
9
-
10
- module Node; include HTree end
11
- module Container; include Node end
12
- class Doc; include Container end
13
- class Elem; include Container end
14
- module Leaf; include Node end
15
- class Text; include Leaf end
16
- class XMLDecl; include Leaf end
17
- class DocType; include Leaf end
18
- class ProcIns; include Leaf end
19
- class Comment; include Leaf end
20
- class BogusETag; include Leaf end
21
-
22
- module Traverse end
23
- module Container::Trav; include Traverse end
24
- module Leaf::Trav; include Traverse end
25
- class Doc; module Trav; include Container::Trav end; include Trav end
26
- class Elem; module Trav; include Container::Trav end; include Trav end
27
- class Text; module Trav; include Leaf::Trav end; include Trav end
28
- class XMLDecl; module Trav; include Leaf::Trav end; include Trav end
29
- class DocType; module Trav; include Leaf::Trav end; include Trav end
30
- class ProcIns; module Trav; include Leaf::Trav end; include Trav end
31
- class Comment; module Trav; include Leaf::Trav end; include Trav end
32
- class BogusETag; module Trav; include Leaf::Trav end; include Trav end
33
-
34
- class Location; include HTree end
35
- module Container::Loc end
36
- module Leaf::Loc end
37
- class Doc; class Loc < Location; include Trav, Container::Loc end end
38
- class Elem; class Loc < Location; include Trav, Container::Loc end end
39
- class Text; class Loc < Location; include Trav, Leaf::Loc end end
40
- class XMLDecl; class Loc < Location; include Trav, Leaf::Loc end end
41
- class DocType; class Loc < Location; include Trav, Leaf::Loc end end
42
- class ProcIns; class Loc < Location; include Trav, Leaf::Loc end end
43
- class Comment; class Loc < Location; include Trav, Leaf::Loc end end
44
- class BogusETag; class Loc < Location; include Trav, Leaf::Loc end end
45
-
46
- class Error < StandardError; end
47
- end
48
- # :startdoc:
@@ -1,124 +0,0 @@
1
- # :stopdoc:
2
- require 'htree/scan' # for Pat::Nmtoken
3
- require 'htree/context'
4
-
5
- module HTree # :nodoc:
6
- # Name represents an element name or an attribute name.
7
- # It consists of a namespace prefix, a namespace URI and a local name.
8
- class Name # :nodoc:
9
- =begin
10
- element name prefix uri localname
11
- {u}n, n with xmlns=u nil 'u' 'n'
12
- p{u}n, p:n with xmlns:p=u 'p' 'u' 'n'
13
- n with xmlns='' nil '' 'n'
14
-
15
- attribute name
16
- xmlns= 'xmlns' nil nil
17
- xmlns:n= 'xmlns' nil 'n'
18
- p{u}n=, p:n= with xmlns:p=u 'p' 'u' 'n'
19
- n= nil '' 'n'
20
- =end
21
- def Name.parse_element_name(name, context)
22
- if /\{(.*)\}/ =~ name
23
- # "{u}n" means "use default namespace",
24
- # "p{u}n" means "use the specified prefix p"
25
- $` == '' ? Name.new(nil, $1, $') : Name.new($`, $1, $')
26
- elsif /:/ =~ name && !context.namespace_uri($`).empty?
27
- Name.new($`, context.namespace_uri($`), $')
28
- elsif !context.namespace_uri(nil).empty?
29
- Name.new(nil, context.namespace_uri(nil), name)
30
- else
31
- Name.new(nil, '', name)
32
- end
33
- end
34
-
35
- def Name.parse_attribute_name(name, context)
36
- if name == 'xmlns'
37
- Name.new('xmlns', nil, nil)
38
- elsif /\Axmlns:/ =~ name
39
- Name.new('xmlns', nil, $')
40
- elsif /\{(.*)\}/ =~ name
41
- case $`
42
- when ''; Name.new(nil, $1, $')
43
- else Name.new($`, $1, $')
44
- end
45
- elsif /:/ =~ name && !context.namespace_uri($`).empty?
46
- Name.new($`, context.namespace_uri($`), $')
47
- else
48
- Name.new(nil, '', name)
49
- end
50
- end
51
-
52
- NameCache = {}
53
- def Name.new(namespace_prefix, namespace_uri, local_name)
54
- key = [namespace_prefix, namespace_uri, local_name, self]
55
- NameCache.fetch(key) {
56
- 0.upto(2) {|i| key[i] = key[i].dup.freeze if key[i] }
57
- NameCache[key] = super(key[0], key[1], key[2])
58
- }
59
- end
60
-
61
- def initialize(namespace_prefix, namespace_uri, local_name)
62
- @namespace_prefix = namespace_prefix
63
- @namespace_uri = namespace_uri
64
- @local_name = local_name
65
- if @namespace_prefix && /\A#{Pat::Nmtoken}\z/o !~ @namespace_prefix
66
- raise HTree::Error, "invalid namespace prefix: #{@namespace_prefix.inspect}"
67
- end
68
- if @local_name && /\A#{Pat::Nmtoken}\z/o !~ @local_name
69
- raise HTree::Error, "invalid local name: #{@local_name.inspect}"
70
- end
71
- if @namespace_prefix == 'xmlns'
72
- unless @namespace_uri == nil
73
- raise HTree::Error, "Name object for xmlns:* must not have namespace URI: #{@namespace_uri.inspect}"
74
- end
75
- else
76
- unless String === @namespace_uri
77
- raise HTree::Error, "invalid namespace URI: #{@namespace_uri.inspect}"
78
- end
79
- end
80
- end
81
- attr_reader :namespace_prefix, :namespace_uri, :local_name
82
-
83
- def xmlns?
84
- @namespace_prefix == 'xmlns' && @namespace_uri == nil
85
- end
86
-
87
- def universal_name
88
- if @namespace_uri && !@namespace_uri.empty?
89
- "{#{@namespace_uri}}#{@local_name}"
90
- else
91
- @local_name.dup
92
- end
93
- end
94
-
95
- def qualified_name
96
- if @namespace_uri && !@namespace_uri.empty?
97
- if @namespace_prefix
98
- "#{@namespace_prefix}:#{@local_name}"
99
- else
100
- @local_name.dup
101
- end
102
- elsif @local_name
103
- @local_name.dup
104
- else
105
- "xmlns"
106
- end
107
- end
108
-
109
- def to_s
110
- if @namespace_uri && !@namespace_uri.empty?
111
- if @namespace_prefix
112
- "#{@namespace_prefix}{#{@namespace_uri}}#{@local_name}"
113
- else
114
- "{#{@namespace_uri}}#{@local_name}"
115
- end
116
- elsif @local_name
117
- @local_name.dup
118
- else
119
- "xmlns"
120
- end
121
- end
122
- end
123
- end
124
- # :startdoc:
@@ -1,207 +0,0 @@
1
- # :stopdoc:
2
- require 'htree/encoder'
3
- require 'htree/doc'
4
- require 'htree/elem'
5
- require 'htree/leaf'
6
- require 'htree/text'
7
-
8
- module HTree # :nodoc:
9
-
10
- class Text # :nodoc:
11
- ChRef = {
12
- '>' => '&gt;',
13
- '<' => '&lt;',
14
- '"' => '&quot;',
15
- }
16
-
17
- def output(out, context)
18
- out.output_text @rcdata.gsub(/[<>]/) {|s| ChRef[s] }
19
- end
20
-
21
- def to_attvalue_content
22
- @rcdata.gsub(/[<>"]/) {|s| ChRef[s] }
23
- end
24
-
25
- def output_attvalue(out, context)
26
- out.output_string '"'
27
- out.output_text to_attvalue_content
28
- out.output_string '"'
29
- end
30
- end
31
-
32
- class Name # :nodoc:
33
- def output(out, context)
34
- # xxx: validate namespace prefix
35
- if xmlns?
36
- if @local_name
37
- out.output_string "xmlns:#{@local_name}"
38
- else
39
- out.output_string "xmlns"
40
- end
41
- else
42
- out.output_string qualified_name
43
- end
44
- end
45
-
46
- def output_attribute(text, out, context)
47
- output(out, context)
48
- out.output_string '='
49
- text.output_attvalue(out, context)
50
- end
51
- end
52
-
53
- class Doc # :nodoc:
54
- def output(out, context)
55
- context = DefaultContext # discard outer context
56
- xmldecl = false
57
- doctypedecl = false
58
- @children.each {|n|
59
- if n.respond_to? :output_prolog_xmldecl
60
- n.output_prolog_xmldecl(out, context) unless xmldecl # xxx: encoding?
61
- xmldecl = true
62
- elsif n.respond_to? :output_prolog_doctypedecl
63
- n.output_prolog_doctypedecl(out, context) unless doctypedecl
64
- doctypedecl = true
65
- else
66
- n.output(out, context)
67
- end
68
- }
69
- end
70
- end
71
-
72
- class Elem # :nodoc:
73
- def output(out, context)
74
- if @empty
75
- @stag.output_emptytag(out, context)
76
- else
77
- children_context = @stag.output_stag(out, context)
78
- @children.each {|n| n.output(out, children_context) }
79
- @stag.output_etag(out, context)
80
- end
81
- end
82
- end
83
-
84
- class STag # :nodoc:
85
- def output_attributes(out, context)
86
- @attributes.each {|aname, text|
87
- next if aname.xmlns?
88
- out.output_string ' '
89
- aname.output_attribute(text, out, context)
90
- }
91
- @context.output_namespaces(out, context)
92
- end
93
-
94
- def output_emptytag(out, context)
95
- out.output_string '<'
96
- @name.output(out, context)
97
- children_context = output_attributes(out, context)
98
- out.output_string "\n/>"
99
- children_context
100
- end
101
-
102
- def output_stag(out, context)
103
- out.output_string '<'
104
- @name.output(out, context)
105
- children_context = output_attributes(out, context)
106
- out.output_string "\n>"
107
- children_context
108
- end
109
-
110
- def output_etag(out, context)
111
- out.output_string '</'
112
- @name.output(out, context)
113
- out.output_string "\n>"
114
- end
115
- end
116
-
117
- class Context # :nodoc:
118
- def output_namespaces(out, outer_context)
119
- unknown_namespaces = {}
120
- @namespaces.each {|prefix, uri|
121
- outer_uri = outer_context.namespace_uri(prefix)
122
- if outer_uri == nil
123
- unknown_namespaces[prefix] = uri
124
- elsif outer_uri != uri
125
- if prefix
126
- out.output_string " xmlns:#{prefix}="
127
- else
128
- out.output_string " xmlns="
129
- end
130
- Text.new(uri).output_attvalue(out, outer_context)
131
- end
132
- }
133
- unless unknown_namespaces.empty?
134
- out.output_xmlns(unknown_namespaces)
135
- end
136
- outer_context.subst_namespaces(@namespaces)
137
- end
138
- end
139
-
140
- class BogusETag # :nodoc:
141
- # don't output anything.
142
- def output(out, context)
143
- end
144
- end
145
-
146
- class XMLDecl # :nodoc:
147
- # don't output anything.
148
- def output(out, context)
149
- end
150
-
151
- def output_prolog_xmldecl(out, context)
152
- out.output_string "<?xml version=\"#{@version}\""
153
- if @encoding
154
- out.output_string " encoding=\"#{@encoding}\""
155
- end
156
- if @standalone != nil
157
- out.output_string " standalone=\"#{@standalone ? 'yes' : 'no'}\""
158
- end
159
- out.output_string "?>"
160
- end
161
- end
162
-
163
- class DocType # :nodoc:
164
- # don't output anything.
165
- def output(out, context)
166
- end
167
-
168
- def generate_content # :nodoc:
169
- result = ''
170
- if @public_identifier
171
- result << "PUBLIC \"#{@public_identifier}\""
172
- else
173
- result << "SYSTEM"
174
- end
175
- # Although a system identifier is not omissible in XML,
176
- # we cannot output it if it is not given.
177
- if @system_identifier
178
- if /"/ !~ @system_identifier
179
- result << " \"#{@system_identifier}\""
180
- else
181
- result << " '#{@system_identifier}'"
182
- end
183
- end
184
- result
185
- end
186
-
187
- def output_prolog_doctypedecl(out, context)
188
- out.output_string "<!DOCTYPE #{@root_element_name} #{generate_content}>"
189
- end
190
- end
191
-
192
- class ProcIns # :nodoc:
193
- def output(out, context)
194
- out.output_string "<?#{@target}"
195
- out.output_string " #{@content}" if @content
196
- out.output_string "?>"
197
- end
198
- end
199
-
200
- class Comment # :nodoc:
201
- def output(out, context)
202
- out.output_string "<!--#{@content}-->"
203
- end
204
- end
205
-
206
- end
207
- # :startdoc:
@@ -1,409 +0,0 @@
1
- # :stopdoc:
2
- require 'htree/scan'
3
- require 'htree/htmlinfo'
4
- require 'htree/text'
5
- require 'htree/tag'
6
- require 'htree/leaf'
7
- require 'htree/doc'
8
- require 'htree/elem'
9
- require 'htree/raw_string'
10
- require 'htree/context'
11
- require 'htree/encoder'
12
- require 'htree/fstr'
13
-
14
- module HTree # :nodoc:
15
- # HTree.parse parses <i>input</i> and returns a document tree
16
- # represented by HTree::Doc.
17
- #
18
- # <i>input</i> should be a String or
19
- # an object which responds to the read or open method.
20
- # For example, IO, StringIO, Pathname, URI::HTTP and URI::FTP are acceptable.
21
- # Note that the URIs need open-uri.
22
- #
23
- # HTree.parse guesses whether <i>input</i> is HTML and whether it is XML.
24
- #
25
- # If it is guessed as HTML, the default namespace in the result is set to http://www.w3.org/1999/xhtml
26
- # regardless of whether <i>input</i> has an XML namespace declaration, even if it is pre-XML HTML.
27
- #
28
- # If it is guessed as HTML and not XML, all element and attribute names are downcased.
29
- #
30
- # If the opened file or the read content has a charset method,
31
- # HTree.parse decodes it according to $KCODE before parsing.
32
- # Otherwise HTree.parse assumes the character encoding of the content is
33
- # compatible with $KCODE.
34
- # Note that the charset method is provided by URI::HTTP with open-uri.
35
- def HTree.parse(input)
36
- HTree.with_frozen_string_hash {
37
- parse_as(input, false)
38
- }
39
- end
40
-
41
- # HTree.parse_xml parses <i>input</i> as XML and
42
- # returns a document tree represented by HTree::Doc.
43
- #
44
- # It behaves almost the same as HTree.parse but it assumes <i>input</i> is XML
44
- # even if there is no XML declaration.
46
- # The assumption causes the following differences.
47
- # * doesn't downcase element name.
48
- # * The content of <script> and <style> element is PCDATA, not CDATA.
49
- def HTree.parse_xml(input)
50
- HTree.with_frozen_string_hash {
51
- parse_as(input, true)
52
- }
53
- end
54
-
55
- def HTree.parse_as(input, is_xml)
56
- input_charset = nil
57
- if input.tainted? && 1 <= $SAFE
58
- raise SecurityError, "input tainted"
59
- end
60
- if input.respond_to? :read # IO, StringIO
61
- input = input.read.untaint
62
- input_charset = input.charset if input.respond_to? :charset
63
- elsif input.respond_to? :open # Pathname, URI with open-uri
64
- input.open {|f|
65
- input = f.read.untaint
66
- input_charset = f.charset if f.respond_to? :charset
67
- }
68
- end
69
- if input_charset && input_charset != Encoder.internal_charset
70
- input = Iconv.conv(Encoder.internal_charset, input_charset, input)
71
- end
72
-
73
- tokens = []
74
- is_xml, is_html = HTree.scan(input, is_xml) {|token|
75
- tokens << token
76
- }
77
- context = is_html ? HTMLContext: DefaultContext
78
- structure_list = parse_pairs(tokens, is_xml, is_html)
79
- structure_list = fix_structure_list(structure_list, is_xml, is_html)
80
- nodes = structure_list.map {|s| build_node(s, is_xml, is_html, context) }
81
- Doc.new(nodes)
82
- end
83
-
84
- def HTree.parse_pairs(tokens, is_xml, is_html)
85
- stack = [[nil, nil, []]]
86
- tokens.each {|token|
87
- case token[0]
88
- when :stag
89
- stag_raw_string = token[1]
90
- stagname = stag_raw_string[Pat::Name]
91
- stagname = stagname.downcase if !is_xml && is_html
92
- stagname = HTree.frozen_string(stagname)
93
- stack << [stagname, stag_raw_string, []]
94
- when :etag
95
- etag_raw_string = token[1]
96
- etagname = etag_raw_string[Pat::Name]
97
- etagname = etagname.downcase if !is_xml && is_html
98
- etagname = HTree.frozen_string(etagname)
99
- matched_elem = nil
100
- stack.reverse_each {|elem|
101
- stagname, _, _ = elem
102
- if stagname == etagname
103
- matched_elem = elem
104
- break
105
- end
106
- }
107
- if matched_elem
108
- # This line breaks in Rails 1.1.
109
- #until matched_elem.equal? stack.last
110
- until matched_elem.object_id == stack.last.object_id
111
- stagname, stag_raw_string, children = stack.pop
112
- stack.last[2] << [:elem, stag_raw_string, children]
113
- end
114
- stagname, stag_raw_string, children = stack.pop
115
- stack.last[2] << [:elem, stag_raw_string, children, etag_raw_string]
116
- else
117
- stack.last[2] << [:bogus_etag, etag_raw_string]
118
- end
119
- else
120
- stack.last[2] << token
121
- end
122
- }
123
- elem = nil
124
- while 1 < stack.length
125
- stagname, stag_raw_string, children = stack.pop
126
- stack.last[2] << [:elem, stag_raw_string, children]
127
- end
128
- stack[0][2]
129
- end
130
-
131
- def HTree.fix_structure_list(structure_list, is_xml, is_html)
132
- result = []
133
- rest = structure_list.dup
134
- until rest.empty?
135
- structure = rest.shift
136
- if structure[0] == :elem
137
- elem, rest2 = fix_element(structure, [], [], is_xml, is_html)
138
- result << elem
139
- rest = rest2 + rest
140
- else
141
- result << structure
142
- end
143
- end
144
- result
145
- end
146
-
147
- def HTree.fix_element(elem, excluded_tags, included_tags, is_xml, is_html)
148
- stag_raw_string = elem[1]
149
- children = elem[2]
150
- if etag_raw_string = elem[3]
151
- return [:elem, stag_raw_string, fix_structure_list(children, is_xml, is_html), etag_raw_string], []
152
- else
153
- tagname = stag_raw_string[Pat::Name]
154
- tagname = tagname.downcase if !is_xml && is_html
155
- if ElementContent[tagname] == :EMPTY
156
- return [:elem, stag_raw_string, []], children
157
- else
158
- if ElementContent[tagname] == :CDATA
159
- possible_tags = []
160
- else
161
- possible_tags = ElementContent[tagname]
162
- end
163
- if possible_tags
164
- excluded_tags2 = ElementExclusions[tagname]
165
- included_tags2 = ElementInclusions[tagname]
166
- excluded_tags |= excluded_tags2 if excluded_tags2
167
- included_tags |= included_tags2 if included_tags2
168
- containable_tags = (possible_tags | included_tags) - excluded_tags
169
- uncontainable_tags = ElementContent.keys - containable_tags
170
- else
171
- # If the tagname is unknown, it is assumed that any element
172
- # except excluded can be contained.
173
- uncontainable_tags = excluded_tags
174
- end
175
- fixed_children = []
176
- rest = children
177
- until rest.empty?
178
- if rest[0][0] == :elem
179
- elem = rest.shift
180
- elem_tagname = elem[1][Pat::Name]
181
- elem_tagname = elem_tagname.downcase if !is_xml && is_html
182
- if uncontainable_tags.include? elem_tagname
183
- rest.unshift elem
184
- break
185
- else
186
- fixed_elem, rest2 = fix_element(elem, excluded_tags, included_tags, is_xml, is_html)
187
- fixed_children << fixed_elem
188
- rest = rest2 + rest
189
- end
190
- else
191
- fixed_children << rest.shift
192
- end
193
- end
194
- return [:elem, stag_raw_string, fixed_children], rest
195
- end
196
- end
197
- end
198
-
199
- def HTree.build_node(structure, is_xml, is_html, inherited_context=DefaultContext)
200
- case structure[0]
201
- when :text_pcdata
202
- Text.parse_pcdata(structure[1])
203
- when :elem
204
- _, stag_rawstring, children, etag_rawstring = structure
205
- etag = etag_rawstring && ETag.parse(etag_rawstring, is_xml, is_html)
206
- stag = STag.parse(stag_rawstring, true, is_xml, is_html, inherited_context)
207
- if !children.empty? || etag
208
- Elem.new!(stag,
209
- children.map {|c| build_node(c, is_xml, is_html, stag.context) },
210
- etag)
211
- else
212
- Elem.new!(stag)
213
- end
214
- when :emptytag
215
- Elem.new!(STag.parse(structure[1], false, is_xml, is_html, inherited_context))
216
- when :bogus_etag
217
- BogusETag.parse(structure[1], is_xml, is_html)
218
- when :xmldecl
219
- XMLDecl.parse(structure[1])
220
- when :doctype
221
- DocType.parse(structure[1], is_xml, is_html)
222
- when :procins
223
- ProcIns.parse(structure[1])
224
- when :comment
225
- Comment.parse(structure[1])
226
- when :text_cdata_content
227
- Text.parse_cdata_content(structure[1])
228
- when :text_cdata_section
229
- Text.parse_cdata_section(structure[1])
230
- else
231
- raise Exception, "[bug] unknown structure: #{structure.inspect}"
232
- end
233
- end
234
-
235
- def STag.parse(raw_string, is_stag, is_xml, is_html, inherited_context=DefaultContext)
236
- attrs = []
237
- if (is_stag ? /\A#{Pat::ValidStartTag_C}\z/o : /\A#{Pat::ValidEmptyTag_C}\z/o) =~ raw_string
238
- qname = $1
239
- $2.scan(Pat::ValidAttr_C) {
240
- attrs << ($5 ? [nil, $5] : [$1, $2 || $3 || $4])
241
- }
242
- elsif (is_stag ? /\A#{Pat::InvalidStartTag_C}\z/o : /\A#{Pat::InvalidEmptyTag_C}\z/o) =~ raw_string
243
- qname = $1
244
- last_attr = $3
245
- $2.scan(Pat::InvalidAttr1_C) {
246
- attrs << ($5 ? [nil, $5] : [$1, $2 || $3 || $4])
247
- }
248
- if last_attr
249
- /#{Pat::InvalidAttr1End_C}/o =~ last_attr
250
- attrs << [$1, $2 || $3]
251
- end
252
- else
253
- raise HTree::Error, "cannot recognize as start tag or empty tag: #{raw_string.inspect}"
254
- end
255
-
256
- qname = qname.downcase if !is_xml && is_html
257
-
258
- attrs.map! {|aname, aval|
259
- if aname
260
- aname = (!is_xml && is_html) ? aname.downcase : aname
261
- [aname, Text.parse_pcdata(aval)]
262
- else
263
- if val2name = OmittedAttrName[qname]
264
- aval_downcase = aval.downcase
265
- aname = val2name.fetch(aval_downcase, aval_downcase)
266
- else
267
- aname = aval
268
- end
269
- [aname, Text.new(aval)]
270
- end
271
- }
272
-
273
- result = STag.new(qname, attrs, inherited_context)
274
- result.raw_string = raw_string
275
- result
276
- end
277
-
278
- def ETag.parse(raw_string, is_xml, is_html)
279
- unless /\A#{Pat::EndTag_C}\z/o =~ raw_string
280
- raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}"
281
- end
282
-
283
- qname = $1
284
- qname = qname.downcase if !is_xml && is_html
285
-
286
- result = self.new(qname)
287
- result.raw_string = raw_string
288
- result
289
- end
290
-
291
- def BogusETag.parse(raw_string, is_xml, is_html)
292
- unless /\A#{Pat::EndTag_C}\z/o =~ raw_string
293
- raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}"
294
- end
295
-
296
- qname = $1
297
- qname = qname.downcase if !is_xml && is_html
298
-
299
- result = self.new(qname)
300
- result.raw_string = raw_string
301
- result
302
- end
303
-
304
- def Text.parse_pcdata(raw_string)
305
- fixed = raw_string.gsub(/&(?:(?:#[0-9]+|#x[0-9a-fA-F]+|([A-Za-z][A-Za-z0-9]*));?)?/o) {|s|
306
- name = $1
307
- case s
308
- when /;\z/
309
- s
310
- when /\A&#/
311
- "#{s};"
312
- when '&'
313
- '&amp;'
314
- else
315
- if NamedCharactersPattern =~ name
316
- "&#{name};"
317
- else
318
- "&amp;#{name}"
319
- end
320
- end
321
- }
322
- fixed = raw_string if fixed == raw_string
323
- result = Text.new_internal(fixed)
324
- result.raw_string = raw_string
325
- result
326
- end
327
-
328
- def Text.parse_cdata_content(raw_string)
329
- result = Text.new(raw_string)
330
- result.raw_string = raw_string
331
- result
332
- end
333
-
334
- def Text.parse_cdata_section(raw_string)
335
- unless /\A#{Pat::CDATA_C}\z/o =~ raw_string
336
- raise HTree::Error, "cannot recognize as CDATA section: #{raw_string.inspect}"
337
- end
338
-
339
- content = $1
340
-
341
- result = Text.new(content)
342
- result.raw_string = raw_string
343
- result
344
- end
345
-
346
- def XMLDecl.parse(raw_string)
347
- unless /\A#{Pat::XmlDecl_C}\z/o =~ raw_string
348
- raise HTree::Error, "cannot recognize as XML declaration: #{raw_string.inspect}"
349
- end
350
-
351
- version = $1 || $2
352
- encoding = $3 || $4
353
- case $5 || $6
354
- when 'yes'
355
- standalone = true
356
- when 'no'
357
- standalone = false
358
- else
359
- standalone = nil
360
- end
361
-
362
- result = XMLDecl.new(version, encoding, standalone)
363
- result.raw_string = raw_string
364
- result
365
- end
366
-
367
- def DocType.parse(raw_string, is_xml, is_html)
368
- unless /\A#{Pat::DocType_C}\z/o =~ raw_string
369
- raise HTree::Error, "cannot recognize as XML declaration: #{raw_string.inspect}"
370
- end
371
-
372
- root_element_name = $1
373
- public_identifier = $2 || $3
374
- system_identifier = $4 || $5
375
-
376
- root_element_name = root_element_name.downcase if !is_xml && is_html
377
-
378
- result = DocType.new(root_element_name, public_identifier, system_identifier)
379
- result.raw_string = raw_string
380
- result
381
- end
382
-
383
- def ProcIns.parse(raw_string)
384
- unless /\A#{Pat::XmlProcIns_C}\z/o =~ raw_string
385
- raise HTree::Error, "cannot recognize as processing instruction: #{raw_string.inspect}"
386
- end
387
-
388
- target = $1
389
- content = $2
390
-
391
- result = ProcIns.new(target, content)
392
- result.raw_string = raw_string
393
- result
394
- end
395
-
396
- def Comment.parse(raw_string)
397
- unless /\A#{Pat::Comment_C}\z/o =~ raw_string
398
- raise HTree::Error, "cannot recognize as comment: #{raw_string.inspect}"
399
- end
400
-
401
- content = $1
402
-
403
- result = Comment.new(content)
404
- result.raw_string = raw_string
405
- result
406
- end
407
-
408
- end
409
- # :startdoc: