rubyjedi-oga 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +13 -0
  3. data/LICENSE +362 -0
  4. data/README.md +317 -0
  5. data/doc/css/common.css +77 -0
  6. data/doc/css_selectors.md +935 -0
  7. data/doc/manually_creating_documents.md +67 -0
  8. data/doc/migrating_from_nokogiri.md +169 -0
  9. data/doc/xml_namespaces.md +63 -0
  10. data/ext/c/extconf.rb +11 -0
  11. data/ext/c/lexer.c +2595 -0
  12. data/ext/c/lexer.h +16 -0
  13. data/ext/c/lexer.rl +198 -0
  14. data/ext/c/liboga.c +6 -0
  15. data/ext/c/liboga.h +11 -0
  16. data/ext/java/Liboga.java +14 -0
  17. data/ext/java/org/liboga/xml/Lexer.java +1363 -0
  18. data/ext/java/org/liboga/xml/Lexer.rl +223 -0
  19. data/ext/ragel/base_lexer.rl +633 -0
  20. data/lib/oga.rb +57 -0
  21. data/lib/oga/blacklist.rb +40 -0
  22. data/lib/oga/css/lexer.rb +743 -0
  23. data/lib/oga/css/parser.rb +976 -0
  24. data/lib/oga/entity_decoder.rb +21 -0
  25. data/lib/oga/html/entities.rb +2150 -0
  26. data/lib/oga/html/parser.rb +25 -0
  27. data/lib/oga/html/sax_parser.rb +18 -0
  28. data/lib/oga/lru.rb +160 -0
  29. data/lib/oga/oga.rb +57 -0
  30. data/lib/oga/version.rb +3 -0
  31. data/lib/oga/whitelist.rb +20 -0
  32. data/lib/oga/xml/attribute.rb +136 -0
  33. data/lib/oga/xml/cdata.rb +17 -0
  34. data/lib/oga/xml/character_node.rb +37 -0
  35. data/lib/oga/xml/comment.rb +17 -0
  36. data/lib/oga/xml/default_namespace.rb +13 -0
  37. data/lib/oga/xml/doctype.rb +82 -0
  38. data/lib/oga/xml/document.rb +108 -0
  39. data/lib/oga/xml/element.rb +428 -0
  40. data/lib/oga/xml/entities.rb +122 -0
  41. data/lib/oga/xml/html_void_elements.rb +15 -0
  42. data/lib/oga/xml/lexer.rb +550 -0
  43. data/lib/oga/xml/namespace.rb +48 -0
  44. data/lib/oga/xml/node.rb +219 -0
  45. data/lib/oga/xml/node_set.rb +333 -0
  46. data/lib/oga/xml/parser.rb +631 -0
  47. data/lib/oga/xml/processing_instruction.rb +37 -0
  48. data/lib/oga/xml/pull_parser.rb +175 -0
  49. data/lib/oga/xml/querying.rb +56 -0
  50. data/lib/oga/xml/sax_parser.rb +192 -0
  51. data/lib/oga/xml/text.rb +66 -0
  52. data/lib/oga/xml/traversal.rb +50 -0
  53. data/lib/oga/xml/xml_declaration.rb +65 -0
  54. data/lib/oga/xpath/evaluator.rb +1798 -0
  55. data/lib/oga/xpath/lexer.rb +1958 -0
  56. data/lib/oga/xpath/parser.rb +622 -0
  57. data/oga.gemspec +45 -0
  58. metadata +227 -0
@@ -0,0 +1,25 @@
module Oga
  module HTML
    ##
    # HTML flavoured parser. It subclasses {Oga::XML::Parser} and differs only
    # in that the `:html` option is always switched on before the options are
    # handed to the parent class.
    #
    # Usage:
    #
    #     Oga::HTML::Parser.new('<meta charset="utf-8">').parse
    #
    class Parser < XML::Parser
      ##
      # @param [String|IO] data The HTML input to parse.
      # @param [Hash] options Options forwarded to the parent class. `:html`
      #  is always set to `true`; the Hash that was passed in is left as-is
      #  because a merged copy is forwarded.
      #
      # @see [Oga::XML::Parser#initialize]
      #
      def initialize(data, options = {})
        html_options = options.merge(:html => true)

        super(data, html_options)
      end
    end # Parser
  end # HTML
end # Oga
@@ -0,0 +1,18 @@
module Oga
  module HTML
    ##
    # SAX parser for HTML input. It behaves like {Oga::XML::SaxParser} (see
    # that class for details) with the `:html` option always enabled.
    #
    class SaxParser < XML::SaxParser
      ##
      # @param [Object] handler Forwarded untouched to the parent class.
      # @param [String|IO] data The HTML input to parse.
      # @param [Hash] options Options forwarded to the parent class with
      #  `:html` forced to `true`. A merged copy is forwarded, the original
      #  Hash is not modified.
      #
      # @see [Oga::XML::SaxParser#initialize]
      #
      def initialize(handler, data, options = {})
        html_options = options.merge(:html => true)

        super(handler, data, html_options)
      end
    end # SaxParser
  end # HTML
end # Oga
@@ -0,0 +1,160 @@
module Oga
  ##
  # Thread-safe cache using a Hash as the underlying storage engine. Whenever
  # the size of the cache exceeds the given limit the oldest keys are removed
  # (based on insert order; reading a key does not change its position).
  #
  # This class uses its own list of keys (as returned by {LRU#keys}) instead of
  # relying on `Hash#keys` as the latter allocates a new Array upon every call.
  #
  # This class doesn't use MonitorMixin due to the extra overhead it adds
  # compared to using a Mutex directly.
  #
  # Example usage:
  #
  #     cache = LRU.new(3)
  #
  #     cache[:a] = 10
  #     cache[:b] = 20
  #     cache[:c] = 30
  #     cache[:d] = 40
  #
  #     cache.keys # => [:b, :c, :d]
  #
  # @api private
  #
  class LRU
    ##
    # @param [Fixnum] maximum The maximum number of keys to keep.
    #
    def initialize(maximum = 1024)
      @maximum = maximum
      @cache   = {}
      @keys    = []
      @mutex   = Mutex.new

      # The thread currently holding @mutex, nil when the lock is free. This
      # is only used to allow nested calls to #synchronize.
      @owner = nil
    end

    ##
    # Changes the maximum size and immediately evicts the oldest keys if the
    # cache is now too large.
    #
    # @param [Fixnum] value
    #
    def maximum=(value)
      synchronize do
        @maximum = value

        resize
      end
    end

    ##
    # @return [Fixnum]
    #
    def maximum
      synchronize { @maximum }
    end

    ##
    # Returns the value of the key.
    #
    # @param [Mixed] key
    # @return [Mixed] The stored value, or nil when the key is not cached.
    #
    def [](key)
      synchronize { @cache[key] }
    end

    ##
    # Sets the key and its value. Old keys are discarded if the LRU size exceeds
    # the limit. Setting an existing key moves it to the end of the key list.
    #
    # @param [Mixed] key
    # @param [Mixed] value
    #
    def []=(key, value)
      synchronize do
        # The Hash lookup is cheap, so the key list is only scanned when the
        # key is known to be cached. Entries are matched with `eql?`, the same
        # rule the Hash uses, so keys such as 1 and 1.0 (which are `==` but
        # distinct Hash keys) can't get the key list and the Hash out of sync.
        if @cache.key?(key)
          @keys.delete_if { |existing| existing.eql?(key) }
        end

        @cache[key] = value
        @keys << key

        resize
      end
    end

    ##
    # Returns a key if it exists, otherwise yields the supplied block and uses
    # its return value as the key value.
    #
    # The block is also called (and its result stored) when the cached value
    # is nil or false. The lock is held while the block runs.
    #
    # @param [Mixed] key
    # @return [Mixed]
    #
    def get_or_set(key)
      synchronize { self[key] ||= yield }
    end

    ##
    # Returns the internal list of keys, oldest first. No copy is made, so the
    # returned Array should be treated as read-only.
    #
    # @return [Array]
    #
    def keys
      synchronize { @keys }
    end

    ##
    # @param [Mixed] key
    # @return [TrueClass|FalseClass]
    #
    def key?(key)
      synchronize { @cache.key?(key) }
    end

    ##
    # Removes all keys from the cache.
    #
    def clear
      synchronize do
        @keys.clear
        @cache.clear
      end
    end

    ##
    # @return [Fixnum]
    #
    def size
      synchronize { @cache.size }
    end

    alias_method :length, :size

    private

    ##
    # Yields the supplied block while holding the mutex. Nested calls made by
    # the thread that already holds the mutex yield directly, which makes the
    # lock re-entrant (a plain Mutex raises when locked twice by one thread).
    #
    # The owner is reset once the outermost block finishes. Without this the
    # last (or initial) owner would keep skipping the mutex and could run
    # concurrently with a thread that does hold it.
    #
    def synchronize
      # @owner can only equal the current thread if this thread set it while
      # holding the mutex, so no locking is needed for a nested call.
      return yield if @owner == Thread.current

      @mutex.synchronize do
        @owner = Thread.current

        begin
          yield
        ensure
          @owner = nil
        end
      end
    end

    ##
    # Removes old keys until the size of the hash no longer exceeds the maximum
    # size. Must be called while holding the lock.
    #
    # @return [Array|NilClass] The removed keys, nil if nothing was removed.
    #
    def resize
      overflow = @cache.size - @maximum

      return unless overflow > 0

      @keys.shift(overflow).each { |key| @cache.delete(key) }
    end
  end # LRU
end # Oga
@@ -0,0 +1,57 @@
module Oga
  ##
  # Parses an XML document by building an {Oga::XML::Parser} for the input
  # and returning the result of its `parse` method.
  #
  # @example
  #  document = Oga.parse_xml('<root>Hello</root>')
  #
  # @see [Oga::XML::Lexer#initialize]
  #
  # @param [String|IO] xml
  # @param [Hash] options Passed as-is to the parser.
  #
  # @return [Oga::XML::Document]
  #
  def self.parse_xml(xml, options = {})
    parser = XML::Parser.new(xml, options)

    parser.parse
  end

  ##
  # Parses an HTML document by building an {Oga::HTML::Parser} for the input
  # and returning the result of its `parse` method.
  #
  # @example
  #  document = Oga.parse_html('<html>...</html>')
  #
  # @see [Oga::XML::Lexer#initialize]
  #
  # @param [String|IO] html
  # @param [Hash] options Passed as-is to the parser.
  #
  # @return [Oga::XML::Document]
  #
  def self.parse_html(html, options = {})
    parser = HTML::Parser.new(html, options)

    parser.parse
  end

  ##
  # Parses an XML document using the SAX parser ({Oga::XML::SaxParser}).
  #
  # @example
  #  handler = SomeSaxHandler.new
  #
  #  Oga.sax_parse_xml(handler, '<root>Hello</root>')
  #
  # @see [Oga::XML::SaxParser#initialize]
  #
  # @param [Object] handler The SAX handler, passed as-is to the parser.
  # @param [String|IO] xml
  # @param [Hash] options Passed as-is to the parser.
  #
  def self.sax_parse_xml(handler, xml, options = {})
    parser = XML::SaxParser.new(handler, xml, options)

    parser.parse
  end

  ##
  # Parses an HTML document using the SAX parser ({Oga::HTML::SaxParser}).
  #
  # @example
  #  handler = SomeSaxHandler.new
  #
  #  Oga.sax_parse_html(handler, '<script>foo()</script>')
  #
  # @see [Oga::XML::SaxParser#initialize]
  #
  # @param [Object] handler The SAX handler, passed as-is to the parser.
  # @param [String|IO] html
  # @param [Hash] options Passed as-is to the parser.
  #
  def self.sax_parse_html(handler, html, options = {})
    parser = HTML::SaxParser.new(handler, html, options)

    parser.parse
  end
end # Oga
@@ -0,0 +1,3 @@
module Oga
  # The version of this gem. Frozen so the shared String can't be modified in
  # place by accident.
  VERSION = '1.0.3'.freeze
end # Oga
@@ -0,0 +1,20 @@
module Oga
  ##
  # Variant of {Oga::Blacklist} that allows a name only when it is part of
  # the list of names.
  #
  # @api private
  #
  class Whitelist < Blacklist
    ##
    # Checks if the given name is part of the list of names.
    #
    # @param [Mixed] name
    # @return [TrueClass|FalseClass]
    #
    def allow?(name)
      listed = names.include?(name)

      listed
    end

    ##
    # Builds a new {Oga::Blacklist} from the names of this whitelist.
    #
    # @return [Oga::Blacklist]
    #
    def to_blacklist
      inverse = Blacklist.new(names)

      inverse
    end
  end # Whitelist
end # Oga
@@ -0,0 +1,136 @@
module Oga
  module XML
    ##
    # Class for storing information about a single XML attribute.
    #
    class Attribute
      # The name of the attribute.
      # @return [String]
      attr_accessor :name

      # The name (prefix) of the namespace, if any.
      # @return [String]
      attr_accessor :namespace_name

      # The element this attribute belongs to.
      # @return [Oga::XML::Element]
      attr_accessor :element

      ##
      # The default namespace available to all attributes. This namespace can
      # not be modified.
      #
      # @return [Oga::XML::Namespace]
      #
      DEFAULT_NAMESPACE = Namespace.new(
        :name => 'xml',
        :uri  => XML::DEFAULT_NAMESPACE.uri
      ).freeze

      ##
      # @param [Hash] options
      #
      # @option options [String] :name
      # @option options [String] :namespace_name
      # @option options [String] :value
      # @option options [Oga::XML::Element] :element
      #
      def initialize(options = {})
        @name    = options[:name]
        @value   = options[:value]
        @element = options[:element]

        @namespace_name = options[:namespace_name]
      end

      ##
      # Returns the {Oga::XML::Namespace} instance for the current namespace
      # name. The result is cached once a namespace has been found.
      #
      # An attribute without an element has no namespaces to look in, so nil
      # is returned for it (unless it uses the default "xml" namespace)
      # instead of raising NoMethodError.
      #
      # @return [Oga::XML::Namespace|NilClass]
      #
      def namespace
        @namespace ||=
          if namespace_name == DEFAULT_NAMESPACE.name
            DEFAULT_NAMESPACE
          elsif element
            element.available_namespaces[namespace_name]
          end
      end

      ##
      # Sets the value. The new value is decoded the next time {#value} is
      # called.
      #
      # @param [String] value
      #
      def value=(value)
        @value   = value
        @decoded = false
      end

      ##
      # Returns the value of the attribute or nil if no explicit value was set.
      #
      # The value is passed through `EntityDecoder.try_decode` the first time
      # it is read; the result replaces the stored value.
      #
      # @return [String|NilClass]
      #
      def value
        if !@decoded && @value
          @value   = EntityDecoder.try_decode(@value, html?)
          @decoded = true
        end

        @value
      end

      ##
      # Returns the value as a String, an empty String if there is no value.
      #
      # @return [String]
      #
      def text
        value.to_s
      end

      alias_method :to_s, :text

      ##
      # Serializes the attribute as `name="value"`, prefixing the name with
      # the namespace name if there is one. An attribute without a value is
      # written with an empty value.
      #
      # @return [String]
      #
      def to_xml
        full_name = namespace_name ? "#{namespace_name}:#{name}" : name
        enc_value = value ? Entities.encode_attribute(value) : nil

        %Q(#{full_name}="#{enc_value}")
      end

      ##
      # Lists the name, namespace and value, leaving out those that are not
      # set.
      #
      # @return [String]
      #
      def inspect
        segments = [:name, :namespace, :value].map do |attr|
          attr_value = send(attr)

          "#{attr}: #{attr_value.inspect}" if attr_value
        end

        "Attribute(#{segments.compact.join(' ')})"
      end

      private

      ##
      # @return [TrueClass|FalseClass] false when there is no element,
      #  otherwise whatever the element's `html?` returns.
      #
      def html?
        !!@element && @element.html?
      end
    end # Attribute
  end # XML
end # Oga
@@ -0,0 +1,17 @@
module Oga
  module XML
    ##
    # Node type for CDATA sections. Everything except the serialization is
    # inherited from {Oga::XML::CharacterNode}.
    #
    class Cdata < CharacterNode
      ##
      # Serializes the node by wrapping its text in a CDATA section.
      #
      # @return [String]
      #
      def to_xml
        body = text

        "<![CDATA[#{body}]]>"
      end
    end # Cdata
  end # XML
end # Oga
@@ -0,0 +1,37 @@
module Oga
  module XML
    ##
    # Parent class of the nodes that wrap a text-like value, such as Text and
    # Comment nodes.
    #
    class CharacterNode < Node
      # The text of the node.
      # @return [String]
      attr_accessor :text

      ##
      # @param [Hash] options Also passed to the parent class' constructor.
      #
      # @option options [String] :text The text of the node.
      #
      def initialize(options = {})
        super(options)

        @text = options[:text]
      end

      ##
      # Returns the text as a String, an empty String if no text was set.
      #
      # @return [String]
      #
      def to_xml
        text.to_s
      end

      ##
      # Returns the class name without its namespace, followed by the
      # inspected text between parentheses.
      #
      # @return [String]
      #
      def inspect
        short_name = self.class.to_s.split('::').last

        "#{short_name}(#{text.inspect})"
      end
    end # CharacterNode
  end # XML
end # Oga