feedtools 0.2.26 → 0.2.27

Sign up to get free protection for your applications and to get access to all the features.
Files changed (166) hide show
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -1,115 +0,0 @@
1
- # :stopdoc:
2
- require 'htree/modules'
3
- require 'htree/raw_string'
4
- require 'htree/htmlinfo'
5
- require 'htree/encoder'
6
- require 'htree/fstr'
7
- require 'iconv'
8
-
9
- module HTree # :nodoc:
10
- class Text # :nodoc:
11
- class << self
12
- alias new_internal new
13
- end
14
-
15
- def Text.new(arg)
16
- arg = arg.to_node if HTree::Location === arg
17
- if Text === arg
18
- new_internal arg.rcdata, arg.normalized_rcdata
19
- elsif String === arg
20
- arg2 = arg.gsub(/&/, '&amp;')
21
- arg = arg2.freeze if arg != arg2
22
- new_internal arg
23
- else
24
- raise TypeError, "cannot initialize Text with #{arg.inspect}"
25
- end
26
- end
27
-
28
- def initialize(rcdata, normalized_rcdata=internal_normalize(rcdata)) # :notnew:
29
- init_raw_string
30
- @rcdata = rcdata && HTree.frozen_string(rcdata)
31
- @normalized_rcdata = @rcdata == normalized_rcdata ? @rcdata : normalized_rcdata
32
- end
33
- attr_reader :rcdata, :normalized_rcdata
34
-
35
- def internal_normalize(rcdata)
36
- # - character references are decoded as much as possible.
37
- # - undecodable character references are converted to decimal numeric character references.
38
- result = rcdata.gsub(/&(?:#([0-9]+)|#x([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));/o) {|s|
39
- u = nil
40
- if $1
41
- u = $1.to_i
42
- elsif $2
43
- u = $2.hex
44
- elsif $3
45
- u = NamedCharacters[$3]
46
- end
47
- if !u || u < 0 || 0x7fffffff < u
48
- '?'
49
- elsif u == 38 # '&' character.
50
- '&#38;'
51
- elsif u <= 0x7f
52
- [u].pack("C")
53
- else
54
- begin
55
- Iconv.conv(Encoder.internal_charset, 'UTF-8', [u].pack("U"))
56
- rescue Iconv::Failure
57
- "&##{u};"
58
- end
59
- end
60
- }
61
- HTree.frozen_string(result)
62
- end
63
- private :internal_normalize
64
-
65
- # HTree::Text#to_s converts the text to a string.
66
- # - character references are decoded as much as possible.
67
- # - undecodable character references are converted to the `?' character.
68
- def to_s
69
- @normalized_rcdata.gsub(/&(?:#([0-9]+));/o) {|s|
70
- u = $1.to_i
71
- if 0 <= u && u <= 0x7f
72
- [u].pack("C")
73
- else
74
- '?'
75
- end
76
- }
77
- end
78
-
79
- def empty?
80
- @normalized_rcdata.empty?
81
- end
82
-
83
- def strip
84
- rcdata = @normalized_rcdata.dup
85
- rcdata.sub!(/\A(?:\s|&nbsp;)+/, '')
86
- rcdata.sub!(/(?:\s|&nbsp;)+\z/, '')
87
- if rcdata == @normalized_rcdata
88
- self
89
- else
90
- rcdata.freeze
91
- Text.new_internal(rcdata, rcdata)
92
- end
93
- end
94
-
95
- # HTree::Text.concat returns a text which is the concatenation of its arguments.
96
- #
97
- # An argument should be one of the following.
98
- # - String
99
- # - HTree::Text
100
- # - HTree::Location which points to an HTree::Text
101
- def Text.concat(*args)
102
- rcdata = ''
103
- args.each {|arg|
104
- arg = arg.to_node if HTree::Location === arg
105
- if Text === arg
106
- rcdata << arg.rcdata
107
- else
108
- rcdata << arg.gsub(/&/, '&amp;')
109
- end
110
- }
111
- new_internal rcdata
112
- end
113
- end
114
- end
115
- # :startdoc:
@@ -1,465 +0,0 @@
1
- # :stopdoc:
2
- require 'htree/doc'
3
- require 'htree/elem'
4
- require 'htree/loc'
5
- require 'htree/extract_text'
6
- require 'uri'
7
-
8
- module HTree # :nodoc:
9
- module Traverse # :nodoc:
10
- def doc?() Doc::Trav === self end
11
- def elem?() Elem::Trav === self end
12
- def text?() Text::Trav === self end
13
- def xmldecl?() XMLDecl::Trav === self end
14
- def doctype?() DocType::Trav === self end
15
- def procins?() ProcIns::Trav === self end
16
- def comment?() Comment::Trav === self end
17
- def bogusetag?() BogusETag::Trav === self end
18
-
19
- def get_subnode(*indexes)
20
- n = self
21
- indexes.each {|index|
22
- n = n.get_subnode_internal(index)
23
- }
24
- n
25
- end
26
- end
27
-
28
- module Container::Trav # :nodoc:
29
- # +each_child+ iterates over each child.
30
- def each_child(&block) # :yields: child_node
31
- children.each(&block)
32
- nil
33
- end
34
-
35
- # +each_child_with_index+ iterates over each child.
36
- def each_child_with_index(&block) # :yields: child_node, index
37
- children.each_with_index(&block)
38
- nil
39
- end
40
-
41
- # +find_element+ searches for an element whose universal name is specified by
42
- # the arguments.
43
- # It returns nil if not found.
44
- def find_element(*names)
45
- traverse_element(*names) {|e| return e }
46
- nil
47
- end
48
-
49
- # +traverse_element+ traverses elements in the tree.
50
- # It yields elements in depth first order.
51
- #
52
- # If _names_ are empty, it yields all elements.
53
- # If non-empty _names_ are given, they should be a list of universal names.
54
- #
55
- # A nested element is yielded in depth first order as follows.
56
- #
57
- # t = HTree('<a id=0><b><a id=1 /></b><c id=2 /></a>')
58
- # t.traverse_element("a", "c") {|e| p e}
59
- # # =>
60
- # {elem <a id="0"> {elem <b> {emptyelem <a id="1">} </b>} {emptyelem <c id="2">} </a>}
61
- # {emptyelem <a id="1">}
62
- # {emptyelem <c id="2">}
63
- #
64
- # Universal names are specified as follows.
65
- #
66
- # t = HTree(<<'End')
67
- # <html>
68
- # <meta name="robots" content="index,nofollow">
69
- # <meta name="author" content="Who am I?">
70
- # </html>
71
- # End
72
- # t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e}
73
- # # =>
74
- # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">}
75
- # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">}
76
- #
77
- def traverse_element(*names, &block) # :yields: element
78
- if names.empty?
79
- traverse_all_element(&block)
80
- else
81
- name_set = {}
82
- names.each {|n| name_set[n] = true }
83
- traverse_some_element(name_set, &block)
84
- end
85
- nil
86
- end
87
-
88
- def each_hyperlink_attribute
89
- traverse_element(
90
- '{http://www.w3.org/1999/xhtml}a',
91
- '{http://www.w3.org/1999/xhtml}area',
92
- '{http://www.w3.org/1999/xhtml}link',
93
- '{http://www.w3.org/1999/xhtml}img',
94
- '{http://www.w3.org/1999/xhtml}object',
95
- '{http://www.w3.org/1999/xhtml}q',
96
- '{http://www.w3.org/1999/xhtml}blockquote',
97
- '{http://www.w3.org/1999/xhtml}ins',
98
- '{http://www.w3.org/1999/xhtml}del',
99
- '{http://www.w3.org/1999/xhtml}form',
100
- '{http://www.w3.org/1999/xhtml}input',
101
- '{http://www.w3.org/1999/xhtml}head',
102
- '{http://www.w3.org/1999/xhtml}base',
103
- '{http://www.w3.org/1999/xhtml}script') {|elem|
104
- case elem.name
105
- when %r{\{http://www.w3.org/1999/xhtml\}(?:base|a|area|link)\z}i
106
- attrs = ['href']
107
- when %r{\{http://www.w3.org/1999/xhtml\}(?:img)\z}i
108
- attrs = ['src', 'longdesc', 'usemap']
109
- when %r{\{http://www.w3.org/1999/xhtml\}(?:object)\z}i
110
- attrs = ['classid', 'codebase', 'data', 'usemap']
111
- when %r{\{http://www.w3.org/1999/xhtml\}(?:q|blockquote|ins|del)\z}i
112
- attrs = ['cite']
113
- when %r{\{http://www.w3.org/1999/xhtml\}(?:form)\z}i
114
- attrs = ['action']
115
- when %r{\{http://www.w3.org/1999/xhtml\}(?:input)\z}i
116
- attrs = ['src', 'usemap']
117
- when %r{\{http://www.w3.org/1999/xhtml\}(?:head)\z}i
118
- attrs = ['profile']
119
- when %r{\{http://www.w3.org/1999/xhtml\}(?:script)\z}i
120
- attrs = ['src', 'for']
121
- end
122
- attrs.each {|attr|
123
- if hyperlink = elem.get_attribute(attr)
124
- yield elem, attr, hyperlink
125
- end
126
- }
127
- }
128
- end
129
- private :each_hyperlink_attribute
130
-
131
- # +each_hyperlink_uri+ traverses hyperlinks such as HTML href attribute
132
- # of A element.
133
- #
134
- # It yields HTree::Text (or HTree::Loc) and URI for each hyperlink.
135
- #
136
- # The URI objects are created with a base URI which is given by
137
- # HTML BASE element or the argument ((|base_uri|)).
138
- # +each_hyperlink_uri+ doesn't yield the href of the BASE element.
139
- def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri
140
- base_uri = URI.parse(base_uri) if String === base_uri
141
- links = []
142
- each_hyperlink_attribute {|elem, attr, hyperlink|
143
- if %r{\{http://www.w3.org/1999/xhtml\}(?:base)\z}i =~ elem.name
144
- base_uri = URI.parse(hyperlink.to_s)
145
- else
146
- links << hyperlink
147
- end
148
- }
149
- if base_uri
150
- links.each {|hyperlink| yield hyperlink, base_uri + hyperlink.to_s }
151
- else
152
- links.each {|hyperlink| yield hyperlink, URI.parse(hyperlink.to_s) }
153
- end
154
- end
155
-
156
- # +each_hyperlink+ traverses hyperlinks such as HTML href attribute
157
- # of A element.
158
- #
159
- # It yields HTree::Text or HTree::Loc.
160
- #
161
- # Note that +each_hyperlink+ yields HTML href attribute of BASE element.
162
- def each_hyperlink # :yields: text
163
- links = []
164
- each_hyperlink_attribute {|elem, attr, hyperlink|
165
- yield hyperlink
166
- }
167
- end
168
-
169
- # +each_uri+ traverses hyperlinks such as HTML href attribute
170
- # of A element.
171
- #
172
- # It yields URI for each hyperlink.
173
- #
174
- # The URI objects are created with a base URI which is given by
175
- # HTML BASE element or the argument ((|base_uri|)).
176
- def each_uri(base_uri=nil) # :yields: URI
177
- each_hyperlink_uri(base_uri) {|hyperlink, uri| yield uri }
178
- end
179
- end
180
-
181
- module Doc::Trav
182
- def traverse_all_element(&block)
183
- children.each {|c| c.traverse_all_element(&block) }
184
- end
185
- end
186
-
187
- module Elem::Trav
188
- def traverse_all_element(&block)
189
- yield self
190
- children.each {|c| c.traverse_all_element(&block) }
191
- end
192
- end
193
-
194
- module Leaf::Trav
195
- def traverse_all_element
196
- end
197
- end
198
-
199
- module Doc::Trav
200
- def traverse_some_element(name_set, &block)
201
- children.each {|c| c.traverse_some_element(name_set, &block) }
202
- end
203
- end
204
-
205
- module Elem::Trav
206
- def traverse_some_element(name_set, &block)
207
- yield self if name_set.include? self.name
208
- children.each {|c| c.traverse_some_element(name_set, &block) }
209
- end
210
- end
211
-
212
- module Leaf::Trav
213
- def traverse_some_element(name_set)
214
- end
215
- end
216
-
217
- module Traverse # :nodoc:
218
- # +traverse_text+ traverses texts in the tree
219
- def traverse_text(&block) # :yields: text
220
- traverse_text_internal(&block)
221
- nil
222
- end
223
- end
224
-
225
- module Container::Trav # :nodoc:
226
- def traverse_text_internal(&block)
227
- each_child {|c| c.traverse_text_internal(&block) }
228
- end
229
- end
230
-
231
- module Leaf::Trav # :nodoc:
232
- def traverse_text_internal
233
- end
234
- end
235
-
236
- module Text::Trav # :nodoc:
237
- def traverse_text_internal
238
- yield self
239
- end
240
- end
241
-
242
- module Container::Trav # :nodoc:
243
- # +filter+ rebuilds the tree without some components.
244
- #
245
- # node.filter {|descendant_node| predicate } -> node
246
- # loc.filter {|descendant_loc| predicate } -> node
247
- #
248
- # +filter+ yields each node except top node.
249
- # If given block returns false, corresponding node is dropped.
250
- # If given block returns true, corresponding node is retained and
251
- # inner nodes are examined.
252
- #
253
- # +filter+ returns a node.
254
- # It doesn't return a location object even if self is a location object.
255
- #
256
- def filter(&block)
257
- subst = {}
258
- each_child_with_index {|descendant, i|
259
- if yield descendant
260
- if descendant.elem?
261
- subst[i] = descendant.filter(&block)
262
- else
263
- subst[i] = descendant
264
- end
265
- else
266
- subst[i] = nil
267
- end
268
- }
269
- to_node.subst_subnode(subst)
270
- end
271
- end
272
-
273
- module Doc::Trav # :nodoc:
274
- # +title+ searches for the title and returns it as a text.
275
- # It returns nil if not found.
276
- #
277
- # +title+ searches the following information.
278
- #
279
- # - <title>...</title> in HTML
280
- # - <title>...</title> in RSS
281
- def title
282
- e = find_element('title',
283
- '{http://www.w3.org/1999/xhtml}title',
284
- '{http://purl.org/rss/1.0/}title',
285
- '{http://my.netscape.com/rdf/simple/0.9/}title')
286
- e && e.extract_text
287
- end
288
-
289
- # +author+ searches for the author and returns it as a text.
290
- # It returns nil if not found.
291
- #
292
- # +author+ searches the following information.
293
- #
294
- # - <meta name="author" content="author-name"> in HTML
295
- # - <link rev="made" title="author-name"> in HTML
296
- # - <dc:creator>author-name</dc:creator> in RSS
297
- # - <dc:publisher>author-name</dc:publisher> in RSS
298
- def author
299
- traverse_element('meta',
300
- '{http://www.w3.org/1999/xhtml}meta') {|e|
301
- begin
302
- next unless e.fetch_attr('name').downcase == 'author'
303
- author = e.fetch_attribute('content').strip
304
- return author if !author.empty?
305
- rescue IndexError
306
- end
307
- }
308
-
309
- traverse_element('link',
310
- '{http://www.w3.org/1999/xhtml}link') {|e|
311
- begin
312
- next unless e.fetch_attr('rev').downcase == 'made'
313
- author = e.fetch_attribute('title').strip
314
- return author if !author.empty?
315
- rescue IndexError
316
- end
317
- }
318
-
319
- if channel = find_element('{http://purl.org/rss/1.0/}channel')
320
- channel.traverse_element('{http://purl.org/dc/elements/1.1/}creator') {|e|
321
- begin
322
- author = e.extract_text.strip
323
- return author if !author.empty?
324
- rescue IndexError
325
- end
326
- }
327
- channel.traverse_element('{http://purl.org/dc/elements/1.1/}publisher') {|e|
328
- begin
329
- author = e.extract_text.strip
330
- return author if !author.empty?
331
- rescue IndexError
332
- end
333
- }
334
- end
335
-
336
- nil
337
- end
338
-
339
- end
340
-
341
- module Doc::Trav # :nodoc:
342
- def root
343
- es = []
344
- children.each {|c| es << c if c.elem? }
345
- raise HTree::Error, "no element" if es.empty?
346
- raise HTree::Error, "multiple top elements" if 1 < es.length
347
- es[0]
348
- end
349
- end
350
-
351
- module Elem::Trav # :nodoc:
352
- # +name+ returns the universal name of the element as a string.
353
- #
354
- # p HTree('<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>').root.name
355
- # # =>
356
- # "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF"
357
- #
358
- def name() element_name.universal_name end
359
-
360
- # +qualified_name+ returns the qualified name of the element as a string.
361
- #
362
- # p HTree('<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>').root.qualified_name
363
- # # =>
364
- # "rdf:RDF"
365
- def qualified_name() element_name.qualified_name end
366
-
367
- # +attributes+ returns attributes as a hash.
368
- # The hash keys are HTree::Name objects.
369
- # The hash values are HTree::Text or HTree::Location objects.
370
- #
371
- # p HTree('<a name="xx" href="uu">').root.attributes
372
- # # =>
373
- # {href=>{text "uu"}, name=>{text "xx"}}
374
- #
375
- # p HTree('<a name="xx" href="uu">').make_loc.root.attributes
376
- # # =>
377
- # {href=>#<HTree::Location: doc()/a/@href>, name=>#<HTree::Location: doc()/a/@name>}
378
- #
379
- def attributes
380
- result = {}
381
- each_attribute {|name, text|
382
- result[name] = text
383
- }
384
- result
385
- end
386
-
387
- def each_attr
388
- each_attribute {|name, text|
389
- uname = name.universal_name
390
- str = text.to_s
391
- yield uname, str
392
- }
393
- end
394
-
395
- # call-seq:
396
- # elem.fetch_attribute(name) -> text or raise IndexError
397
- # elem.fetch_attribute(name, default) -> text or default
398
- # elem.fetch_attribute(name) {|uname| default } -> text or default
399
- #
400
- # +fetch_attribute+ returns an attribute value as a text.
401
- #
402
- # elem may be an instance of HTree::Elem or a location that points to it.
403
- def fetch_attribute(uname, *rest, &block)
404
- if 1 < rest.length
405
- raise ArgumentError, "wrong number of arguments (#{1+rest.length} for 2)"
406
- end
407
- if !rest.empty? && block_given?
408
- raise ArgumentError, "block supersedes default value argument"
409
- end
410
- uname = uname.universal_name if uname.respond_to? :universal_name
411
- return update_attribute_hash.fetch(uname) {
412
- if block_given?
413
- return yield(uname)
414
- elsif !rest.empty?
415
- return rest[0]
416
- else
417
- raise IndexError, "attribute not found: #{uname.inspect}"
418
- end
419
- }
420
- end
421
-
422
- # call-seq:
423
- # elem.fetch_attr(name) -> string or raise IndexError
424
- # elem.fetch_attr(name, default) -> string or default
425
- # elem.fetch_attr(name) {|uname| default } -> string or default
426
- #
427
- # +fetch_attr+ returns an attribute value as a string.
428
- #
429
- # elem may be an instance of HTree::Elem or a location that points to it.
430
- def fetch_attr(uname, *rest, &block)
431
- if 1 < rest.length
432
- raise ArgumentError, "wrong number of arguments (#{1+rest.length} for 2)"
433
- end
434
- if !rest.empty? && block_given?
435
- raise ArgumentError, "block supersedes default value argument"
436
- end
437
- uname = uname.universal_name if uname.respond_to? :universal_name
438
- return update_attribute_hash.fetch(uname) {
439
- if block_given?
440
- return yield(uname)
441
- elsif !rest.empty?
442
- return rest[0]
443
- else
444
- raise IndexError, "attribute not found: #{uname.inspect}"
445
- end
446
- }.to_s
447
- end
448
-
449
- def get_attribute(uname)
450
- uname = uname.universal_name if uname.respond_to? :universal_name
451
- update_attribute_hash[uname]
452
- end
453
-
454
- def get_attr(uname)
455
- if text = update_attribute_hash[uname]
456
- text.to_s
457
- else
458
- nil
459
- end
460
- end
461
-
462
- end
463
-
464
- end
465
- # :startdoc: