feedtools 0.2.26 → 0.2.27

Sign up to get free protection for your applications and to get access to all the features.
Files changed (166) hide show
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -1,124 +0,0 @@
1
- # :stopdoc:
2
- require 'htree/modules'
3
- require 'htree/fstr'
4
-
5
- module HTree # :nodoc:
6
- module Node # :nodoc:
7
- # raw_string returns a source string recorded by parsing.
8
- # It returns +nil+ if the node is constructed not via parsing.
9
- def raw_string
10
- catch(:raw_string_tag) {
11
- return raw_string_internal('')
12
- }
13
- nil
14
- end
15
- end
16
-
17
- class Doc # :nodoc:
18
- def raw_string_internal(result)
19
- @children.each {|n|
20
- n.raw_string_internal(result)
21
- }
22
- end
23
- end
24
-
25
- class Elem # :nodoc:
26
- def raw_string_internal(result)
27
- @stag.raw_string_internal(result)
28
- @children.each {|n| n.raw_string_internal(result) }
29
- @etag.raw_string_internal(result) if @etag
30
- end
31
- end
32
-
33
- module Tag # :nodoc:
34
- def init_raw_string() @raw_string = nil end
35
- def raw_string=(arg) @raw_string = HTree.frozen_string(arg) end
36
- def raw_string_internal(result)
37
- throw :raw_string_tag if !@raw_string
38
- result << @raw_string
39
- end
40
- end
41
-
42
- module Leaf # :nodoc:
43
- def init_raw_string() @raw_string = nil end
44
- def raw_string=(arg) @raw_string = HTree.frozen_string(arg) end
45
- def raw_string_internal(result)
46
- throw :raw_string_tag if !@raw_string
47
- result << @raw_string
48
- end
49
- end
50
-
51
- class Text # :nodoc:
52
- def raw_string=(arg)
53
- if arg == @rcdata then
54
- @raw_string = @rcdata
55
- else
56
- super
57
- end
58
- end
59
- end
60
-
61
- module Node # :nodoc:
62
- def eliminate_raw_string
63
- raise NotImplementedError
64
- end
65
- end
66
-
67
- class Doc # :nodoc:
68
- def eliminate_raw_string
69
- Doc.new(@children.map {|c| c.eliminate_raw_string })
70
- end
71
- end
72
-
73
- class Elem # :nodoc:
74
- def eliminate_raw_string
75
- Elem.new!(
76
- @stag.eliminate_raw_string,
77
- @empty ? nil : @children.map {|c| c.eliminate_raw_string },
78
- @etag && @etag.eliminate_raw_string)
79
- end
80
- end
81
-
82
- class Text # :nodoc:
83
- def eliminate_raw_string
84
- Text.new_internal(@rcdata)
85
- end
86
- end
87
-
88
- class STag # :nodoc:
89
- def eliminate_raw_string
90
- STag.new(@qualified_name, @attributes, @inherited_context)
91
- end
92
- end
93
-
94
- class ETag # :nodoc:
95
- def eliminate_raw_string
96
- self.class.new(@qualified_name)
97
- end
98
- end
99
-
100
- class XMLDecl # :nodoc:
101
- def eliminate_raw_string
102
- XMLDecl.new(@version, @encoding, @standalone)
103
- end
104
- end
105
-
106
- class DocType # :nodoc:
107
- def eliminate_raw_string
108
- DocType.new(@root_element_name, @public_identifier, @system_identifier)
109
- end
110
- end
111
-
112
- class ProcIns # :nodoc:
113
- def eliminate_raw_string
114
- ProcIns.new(@target, @content)
115
- end
116
- end
117
-
118
- class Comment # :nodoc:
119
- def eliminate_raw_string
120
- Comment.new(@content)
121
- end
122
- end
123
- end
124
- # :startdoc:
@@ -1,15 +0,0 @@
1
- # :stopdoc:
2
- class Regexp # :nodoc:
3
- def disable_capture
4
- re = ''
5
- self.source.scan(/\\.|[^\\\(]+|\(\?|\(/m) {|s|
6
- if s == '('
7
- re << '(?:'
8
- else
9
- re << s
10
- end
11
- }
12
- Regexp.new(re, self.options, self.kcode)
13
- end
14
- end
15
- # :startdoc:
@@ -1,130 +0,0 @@
1
- # = REXML Tree Generator
2
- #
3
- # HTree::Node#to_rexml is used for converting HTree to REXML.
4
- #
5
- # == Method Summary
6
- #
7
- # - HTree::Node#to_rexml -> REXML::Child
8
- #
9
- # == Example
10
- #
11
- # HTree.parse(...).to_rexml #=> REXML::Document
12
- #
13
- # == Comparison between HTree and REXML.
14
- #
15
- # - HTree parser is permissive HTML/XML parser.
16
- # REXML parser is strict XML parser.
17
- # HTree is recommended if you need to parse realworld HTML.
18
- # REXML is recommended if you need strict error checking.
19
- # - HTree object is immutable.
20
- # REXML object is mutable.
21
- # REXML should be used if you need modification.
22
- #
23
-
24
- # :stopdoc:
25
- require 'htree/modules'
26
- require 'htree/output' # HTree::DocType#generate_content
27
-
28
- module HTree # :nodoc:
29
- module Node # :nodoc:
30
- # convert to REXML tree.
31
- def to_rexml
32
- require 'rexml/document'
33
- to_rexml_internal(nil, DefaultContext)
34
- end
35
- end
36
-
37
- class Doc # :nodoc:
38
- def to_rexml_internal(parent, context)
39
- raise ArgumentError, "parent must be nil" if parent != nil
40
- result = REXML::Document.new
41
- self.children.each {|c|
42
- c.to_rexml_internal(result, context)
43
- }
44
- result
45
- end
46
- end
47
-
48
- class Elem # :nodoc:
49
- def to_rexml_internal(parent, context)
50
- ename = self.element_name
51
- ns_decl = {}
52
- if context.namespace_uri(ename.namespace_prefix) != ename.namespace_uri
53
- ns_decl[ename.namespace_prefix] = ename.namespace_uri
54
- end
55
-
56
- if ename.namespace_prefix
57
- result = REXML::Element.new("#{ename.namespace_prefix}:#{ename.local_name}", parent)
58
- else
59
- result = REXML::Element.new(ename.local_name, parent)
60
- end
61
-
62
- self.each_attribute {|aname, atext|
63
- if aname.namespace_prefix
64
- if context.namespace_uri(aname.namespace_prefix) != aname.namespace_uri
65
- ns_decl[aname.namespace_prefix] = aname.namespace_uri
66
- end
67
- result.add_attribute("#{aname.namespace_prefix}:#{aname.local_name}", atext.to_s)
68
- else
69
- result.add_attribute(aname.local_name, atext.to_s)
70
- end
71
- }
72
-
73
- ns_decl.each {|k, v|
74
- if k
75
- result.add_namespace(k, v)
76
- else
77
- result.add_namespace(v)
78
- end
79
- }
80
- context = context.subst_namespaces(ns_decl)
81
-
82
- self.children.each {|c|
83
- c.to_rexml_internal(result, context)
84
- }
85
- result
86
- end
87
- end
88
-
89
- class Text # :nodoc:
90
- def to_rexml_internal(parent, context)
91
- rcdata = self.rcdata.gsub(/[<>]/) { Encoder::ChRef[$&] }
92
- REXML::Text.new(rcdata, true, parent, true)
93
- end
94
- end
95
-
96
- class XMLDecl # :nodoc:
97
- def to_rexml_internal(parent, context)
98
- r = REXML::XMLDecl.new(self.version, self.encoding, self.standalone)
99
- parent << r if parent
100
- r
101
- end
102
- end
103
-
104
- class DocType # :nodoc:
105
- def to_rexml_internal(parent, context)
106
- REXML::DocType.new([self.root_element_name, self.generate_content], parent)
107
- end
108
- end
109
-
110
- class ProcIns # :nodoc:
111
- def to_rexml_internal(parent, context)
112
- r = REXML::Instruction.new(self.target, self.content)
113
- parent << r if parent
114
- r
115
- end
116
- end
117
-
118
- class Comment # :nodoc:
119
- def to_rexml_internal(parent, context)
120
- REXML::Comment.new(self.content, parent)
121
- end
122
- end
123
-
124
- class BogusETag # :nodoc:
125
- def to_rexml_internal(parent, context)
126
- nil
127
- end
128
- end
129
- end
130
- # :startdoc:
@@ -1,166 +0,0 @@
1
- # :stopdoc:
2
- require 'htree/htmlinfo'
3
- require 'htree/regexp-util'
4
- require 'htree/fstr'
5
-
6
- module HTree # :nodoc:
7
- module Pat # :nodoc:
8
- NameChar = /[-A-Za-z0-9._:]/
9
- Name = /[A-Za-z_:]#{NameChar}*/
10
- Nmtoken = /#{NameChar}+/
11
-
12
- Comment_C = /<!--(.*?)-->/m
13
- Comment = Comment_C.disable_capture
14
- CDATA_C = /<!\[CDATA\[(.*?)\]\]>/m
15
- CDATA = CDATA_C.disable_capture
16
-
17
- QuotedAttr_C = /(#{Name})\s*=\s*(?:"([^"]*)"|'([^']*)')/
18
- QuotedAttr = QuotedAttr_C.disable_capture
19
- ValidAttr_C = /(#{Name})\s*=\s*(?:"([^"]*)"|'([^']*)'|(#{NameChar}*))|(#{Nmtoken})/
20
- ValidAttr = ValidAttr_C.disable_capture
21
- InvalidAttr1_C = /(#{Name})\s*=\s*(?:'([^'<>]*)'|"([^"<>]*)"|([^\s<>"']*))|(#{Nmtoken})/
22
- InvalidAttr1 = InvalidAttr1_C.disable_capture
23
- InvalidAttr1End_C = /(#{Name})(?:\s*=\s*(?:'([^'<>]*)|"([^"<>]*)))/
24
- InvalidAttr1End = InvalidAttr1End_C.disable_capture
25
-
26
- QuotedStartTag_C = /<(#{Name})((?:\s+#{QuotedAttr})*)\s*>/
27
- QuotedStartTag = QuotedStartTag_C.disable_capture
28
- ValidStartTag_C = /<(#{Name})((?:\s+#{ValidAttr})*)\s*>/
29
- ValidStartTag = ValidStartTag_C.disable_capture
30
- InvalidStartTag_C = /<(#{Name})((?:(?:\b|\s+)#{InvalidAttr1})*)((?:\b|\s+)#{InvalidAttr1End})?\s*>/
31
- InvalidStartTag = InvalidStartTag_C.disable_capture
32
- StartTag = /#{QuotedStartTag}|#{ValidStartTag}|#{InvalidStartTag}/
33
-
34
- QuotedEmptyTag_C = %r{<(#{Name})((?:\s+#{QuotedAttr})*)\s*/>}
35
- QuotedEmptyTag = QuotedEmptyTag_C.disable_capture
36
- ValidEmptyTag_C = %r{<(#{Name})((?:\s+#{ValidAttr})*)\s*/>}
37
- ValidEmptyTag = ValidEmptyTag_C.disable_capture
38
- InvalidEmptyTag_C = %r{<(#{Name})((?:(?:\b|\s+)#{InvalidAttr1})*)((?:\b|\s+)#{InvalidAttr1End})?\s*/>}
39
- InvalidEmptyTag = InvalidEmptyTag_C.disable_capture
40
- EmptyTag = /#{QuotedEmptyTag}|#{ValidEmptyTag}|#{InvalidEmptyTag}/
41
-
42
- EndTag_C = %r{</(#{Name})\s*>}
43
- EndTag = EndTag_C.disable_capture
44
-
45
- XmlVersionNum = /[a-zA-Z0-9_.:-]+/
46
- XmlVersionInfo_C = /\s+version\s*=\s*(?:'(#{XmlVersionNum})'|"(#{XmlVersionNum})")/
47
- XmlVersionInfo = XmlVersionInfo_C.disable_capture
48
- XmlEncName = /[A-Za-z][A-Za-z0-9._-]*/
49
- XmlEncodingDecl_C = /\s+encoding\s*=\s*(?:"(#{XmlEncName})"|'(#{XmlEncName})')/
50
- XmlEncodingDecl = XmlEncodingDecl_C.disable_capture
51
- XmlSDDecl_C = /\s+standalone\s*=\s*(?:'(yes|no)'|"(yes|no)")/
52
- XmlSDDecl = XmlSDDecl_C.disable_capture
53
- XmlDecl_C = /<\?xml#{XmlVersionInfo_C}#{XmlEncodingDecl_C}?#{XmlSDDecl_C}?\s*\?>/
54
- XmlDecl = /<\?xml#{XmlVersionInfo}#{XmlEncodingDecl}?#{XmlSDDecl}?\s*\?>/
55
-
56
- # xxx: internal DTD subset is not recognized: '[' (markupdecl | DeclSep)* ']' S?)?
57
- SystemLiteral_C = /"([^"]*)"|'([^']*)'/
58
- PubidLiteral_C = %r{"([\sa-zA-Z0-9\-'()+,./:=?;!*\#@$_%]*)"|'([\sa-zA-Z0-9\-()+,./:=?;!*\#@$_%]*)'}
59
- ExternalID_C = /(?:SYSTEM|PUBLIC\s+#{PubidLiteral_C})(?:\s+#{SystemLiteral_C})?/
60
- DocType_C = /<!DOCTYPE\s+(#{Name})(?:\s+#{ExternalID_C})?\s*(?:\[.*?\]\s*)?>/m
61
- DocType = DocType_C.disable_capture
62
-
63
- XmlProcIns_C = /<\?(#{Name})(?:\s+(.*?))?\?>/m
64
- XmlProcIns = XmlProcIns_C.disable_capture
65
- #ProcIns = /<\?([^>]*)>/m
66
- end
67
-
68
- def HTree.scan(input, is_xml=false)
69
- is_html = false
70
- cdata_content = nil
71
- text_start = 0
72
- first_element = true
73
- index_xmldecl = 1
74
- index_doctype = 2
75
- index_xmlprocins = 3
76
- index_quotedstarttag = 4
77
- index_quotedemptytag = 5
78
- index_starttag = 6
79
- index_endtag = 7
80
- index_emptytag = 8
81
- index_comment = 9
82
- index_cdata = 10
83
- input.scan(/(#{Pat::XmlDecl})
84
- |(#{Pat::DocType})
85
- |(#{Pat::XmlProcIns})
86
- |(#{Pat::QuotedStartTag})
87
- |(#{Pat::QuotedEmptyTag})
88
- |(#{Pat::StartTag})
89
- |(#{Pat::EndTag})
90
- |(#{Pat::EmptyTag})
91
- |(#{Pat::Comment})
92
- |(#{Pat::CDATA})
93
- /ox) {
94
- match = $~
95
- if cdata_content
96
- str = $&
97
- if match.begin(index_endtag) && str[Pat::Name] == cdata_content
98
- text_end = match.begin(0)
99
- if text_start < text_end
100
- yield [:text_cdata_content, HTree.frozen_string(input[text_start...text_end])]
101
- end
102
- yield [:etag, HTree.frozen_string(str)]
103
- text_start = match.end(0)
104
- cdata_content = nil
105
- end
106
- else
107
- str = match[0]
108
- text_end = match.begin(0)
109
- if text_start < text_end
110
- yield [:text_pcdata, HTree.frozen_string(input[text_start...text_end])]
111
- end
112
- text_start = match.end(0)
113
- if match.begin(index_xmldecl)
114
- yield [:xmldecl, HTree.frozen_string(str)]
115
- is_xml = true
116
- elsif match.begin(index_doctype)
117
- Pat::DocType_C =~ str
118
- root_element_name = $1
119
- public_identifier = $2 || $3
120
- system_identifier = $4 || $5
121
- is_html = true if /\Ahtml\z/i =~ root_element_name
122
- is_xml = true if public_identifier && %r{\A-//W3C//DTD XHTML } =~ public_identifier
123
- yield [:doctype, HTree.frozen_string(str)]
124
- elsif match.begin(index_xmlprocins)
125
- yield [:procins, HTree.frozen_string(str)]
126
- elsif match.begin(index_starttag) || match.begin(index_quotedstarttag)
127
- yield stag = [:stag, HTree.frozen_string(str)]
128
- tagname = str[Pat::Name]
129
- if first_element
130
- if /\A(?:html|head|title|isindex|base|script|style|meta|link|object)\z/i =~ tagname
131
- is_html = true
132
- else
133
- is_xml = true
134
- end
135
- first_element = false
136
- end
137
- if !is_xml && ElementContent[tagname] == :CDATA
138
- cdata_content = tagname
139
- end
140
- elsif match.begin(index_endtag)
141
- yield [:etag, HTree.frozen_string(str)]
142
- elsif match.begin(index_emptytag) || match.begin(index_quotedemptytag)
143
- yield [:emptytag, HTree.frozen_string(str)]
144
- first_element = false
145
- #is_xml = true
146
- elsif match.begin(index_comment)
147
- yield [:comment, HTree.frozen_string(str)]
148
- elsif match.begin(index_cdata)
149
- yield [:text_cdata_section, HTree.frozen_string(str)]
150
- else
151
- raise Exception, "unknown match [bug]"
152
- end
153
- end
154
- }
155
- text_end = input.length
156
- if text_start < text_end
157
- if cdata_content
158
- yield [:text_cdata_content, HTree.frozen_string(input[text_start...text_end])]
159
- else
160
- yield [:text_pcdata, HTree.frozen_string(input[text_start...text_end])]
161
- end
162
- end
163
- return is_xml, is_html
164
- end
165
- end
166
- # :startdoc:
@@ -1,111 +0,0 @@
1
- # :stopdoc:
2
- require 'htree/raw_string'
3
- require 'htree/text'
4
- require 'htree/scan' # for Pat::Name and Pat::Nmtoken
5
- require 'htree/context'
6
- require 'htree/name'
7
- require 'htree/fstr'
8
-
9
- module HTree # :nodoc:
10
- class STag # :nodoc:
11
- def initialize(name, attributes=[], inherited_context=DefaultContext)
12
- init_raw_string
13
- # normalize xml declaration name and attribute value.
14
- attributes = attributes.map {|aname, val|
15
- if !(Name === aname) && /\A(?:#{Pat::Name}?\{.*\})?#{Pat::Nmtoken}\z/o !~ aname
16
- raise HTree::Error, "invalid attribute name: #{aname.inspect}"
17
- end
18
- if !(Name === aname) && /\Axmlns(?:\z|:)/ =~ aname
19
- aname = Name.parse_attribute_name(aname, nil)
20
- end
21
- val = val.to_node if HTree::Location === val
22
- val = Text.new(val) unless Text === val
23
- [aname, val]
24
- }
25
-
26
- @inherited_context = inherited_context
27
- @xmlns_decls = {}
28
-
29
- # validate namespace consistency of given Name objects.
30
- if Name === name
31
- @xmlns_decls[name.namespace_prefix] = name.namespace_uri
32
- end
33
- attributes.each {|aname, text|
34
- next unless Name === aname
35
- next if aname.xmlns?
36
- if aname.namespace_prefix && aname.namespace_uri
37
- if @xmlns_decls.include? aname.namespace_prefix
38
- if @xmlns_decls[aname.namespace_prefix] != aname.namespace_uri
39
- raise ArgumentError, "inconsistent namespace use: #{aname.namespace_prefix} is used as #{@xmlns_decls[aname.namespace_prefix]} and #{aname.namespace_uri}"
40
- end
41
- else
42
- @xmlns_decls[aname.namespace_prefix] = aname.namespace_uri
43
- end
44
- end
45
- }
46
-
47
- attributes.each {|aname, text|
48
- next unless Name === aname
49
- next unless aname.xmlns?
50
- next if @xmlns_decls.include? aname.local_name
51
- if aname.local_name
52
- @xmlns_decls[aname.local_name] = text.to_s
53
- else
54
- uri = text.to_s
55
- @xmlns_decls[nil] = uri
56
- end
57
- }
58
-
59
- @context = make_context(@inherited_context)
60
-
61
- if Name === name
62
- @name = name
63
- else
64
- @name = Name.parse_element_name(name, @context)
65
- end
66
-
67
- @attributes = attributes.map {|aname, text|
68
- aname = Name.parse_attribute_name(aname, @context) unless Name === aname
69
- if !aname.namespace_prefix && !aname.namespace_uri.empty?
70
- # xxx: should recover error?
71
- raise HTree::Error, "global attribute without namespace prefix: #{aname.inspect}"
72
- end
73
- [aname, text]
74
- }
75
- @attributes.freeze
76
- end
77
- attr_reader :attributes, :inherited_context, :context
78
-
79
- def element_name
80
- @name
81
- end
82
-
83
- def make_context(inherited_context)
84
- inherited_context.subst_namespaces(@xmlns_decls)
85
- end
86
-
87
- def each_namespace_attribute
88
- @xmlns_decls.each {|name, uri|
89
- yield name, uri
90
- }
91
- nil
92
- end
93
-
94
- def each_attribute
95
- @attributes.each {|name, text|
96
- next if name.xmlns?
97
- yield name, text
98
- }
99
- nil
100
- end
101
- end
102
-
103
- class ETag # :nodoc:
104
- def initialize(qualified_name)
105
- init_raw_string
106
- @qualified_name = HTree.frozen_string(qualified_name)
107
- end
108
- attr_reader :qualified_name
109
- end
110
- end
111
- # :startdoc: