rubyjedi-oga 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +13 -0
  3. data/LICENSE +362 -0
  4. data/README.md +317 -0
  5. data/doc/css/common.css +77 -0
  6. data/doc/css_selectors.md +935 -0
  7. data/doc/manually_creating_documents.md +67 -0
  8. data/doc/migrating_from_nokogiri.md +169 -0
  9. data/doc/xml_namespaces.md +63 -0
  10. data/ext/c/extconf.rb +11 -0
  11. data/ext/c/lexer.c +2595 -0
  12. data/ext/c/lexer.h +16 -0
  13. data/ext/c/lexer.rl +198 -0
  14. data/ext/c/liboga.c +6 -0
  15. data/ext/c/liboga.h +11 -0
  16. data/ext/java/Liboga.java +14 -0
  17. data/ext/java/org/liboga/xml/Lexer.java +1363 -0
  18. data/ext/java/org/liboga/xml/Lexer.rl +223 -0
  19. data/ext/ragel/base_lexer.rl +633 -0
  20. data/lib/oga.rb +57 -0
  21. data/lib/oga/blacklist.rb +40 -0
  22. data/lib/oga/css/lexer.rb +743 -0
  23. data/lib/oga/css/parser.rb +976 -0
  24. data/lib/oga/entity_decoder.rb +21 -0
  25. data/lib/oga/html/entities.rb +2150 -0
  26. data/lib/oga/html/parser.rb +25 -0
  27. data/lib/oga/html/sax_parser.rb +18 -0
  28. data/lib/oga/lru.rb +160 -0
  29. data/lib/oga/oga.rb +57 -0
  30. data/lib/oga/version.rb +3 -0
  31. data/lib/oga/whitelist.rb +20 -0
  32. data/lib/oga/xml/attribute.rb +136 -0
  33. data/lib/oga/xml/cdata.rb +17 -0
  34. data/lib/oga/xml/character_node.rb +37 -0
  35. data/lib/oga/xml/comment.rb +17 -0
  36. data/lib/oga/xml/default_namespace.rb +13 -0
  37. data/lib/oga/xml/doctype.rb +82 -0
  38. data/lib/oga/xml/document.rb +108 -0
  39. data/lib/oga/xml/element.rb +428 -0
  40. data/lib/oga/xml/entities.rb +122 -0
  41. data/lib/oga/xml/html_void_elements.rb +15 -0
  42. data/lib/oga/xml/lexer.rb +550 -0
  43. data/lib/oga/xml/namespace.rb +48 -0
  44. data/lib/oga/xml/node.rb +219 -0
  45. data/lib/oga/xml/node_set.rb +333 -0
  46. data/lib/oga/xml/parser.rb +631 -0
  47. data/lib/oga/xml/processing_instruction.rb +37 -0
  48. data/lib/oga/xml/pull_parser.rb +175 -0
  49. data/lib/oga/xml/querying.rb +56 -0
  50. data/lib/oga/xml/sax_parser.rb +192 -0
  51. data/lib/oga/xml/text.rb +66 -0
  52. data/lib/oga/xml/traversal.rb +50 -0
  53. data/lib/oga/xml/xml_declaration.rb +65 -0
  54. data/lib/oga/xpath/evaluator.rb +1798 -0
  55. data/lib/oga/xpath/lexer.rb +1958 -0
  56. data/lib/oga/xpath/parser.rb +622 -0
  57. data/oga.gemspec +45 -0
  58. metadata +227 -0
@@ -0,0 +1,37 @@
1
module Oga
  module XML
    ##
    # Node describing one processing instruction. On top of what
    # {Oga::XML::CharacterNode} provides it stores the instruction's name.
    #
    class ProcessingInstruction < CharacterNode
      # The name of the instruction.
      # @return [String]
      attr_accessor :name

      ##
      # Builds the node. All options are handed to the parent initializer
      # first; afterwards the `:name` option is stored on this instance.
      #
      # @param [Hash] options
      #
      # @option options [String] :name The name of the instruction.
      # @see Oga::XML::CharacterNode#initialize
      #
      def initialize(options = {})
        super

        @name = options[:name]
      end

      ##
      # Serializes the instruction as `<?` + name + text + `?>`. No separator
      # is inserted between the name and the text.
      #
      # @return [String]
      #
      def to_xml
        instruction = name
        body        = text

        "<?#{instruction}#{body}?>"
      end

      ##
      # Returns a debugging representation showing the inspected name and
      # text of the instruction.
      #
      # @return [String]
      #
      def inspect
        shown_name = name.inspect
        shown_text = text.inspect

        "ProcessingInstruction(name: #{shown_name} text: #{shown_text})"
      end
    end # ProcessingInstruction
  end # XML
end # Oga
@@ -0,0 +1,175 @@
1
module Oga
  module XML
    ##
    # The PullParser class can be used to parse an XML document incrementally
    # instead of parsing it as a whole. This results in lower memory usage and
    # potentially faster parsing times. The downside is that pull parsers are
    # typically more difficult to use compared to DOM parsers.
    #
    # Basic parsing using this class works as follows:
    #
    #     parser = Oga::XML::PullParser.new('... xml here ...')
    #
    #     parser.parse do |node|
    #       if node.is_a?(Oga::XML::Element)
    #
    #       end
    #     end
    #
    # This parser yields proper XML instances such as {Oga::XML::Element}.
    # Doctypes and XML declarations are ignored by this parser.
    #
    class PullParser < Parser
      # The node that was most recently handed to the block.
      # @return [Oga::XML::Node]
      attr_reader :node

      # Array containing the names of the currently nested elements.
      # @return [Array]
      attr_reader :nesting

      ##
      # Parser callbacks that are replaced with no-ops, so the corresponding
      # nodes are never built or yielded.
      #
      # @return [Array]
      #
      DISABLED_CALLBACKS = [
        :on_document,
        :on_doctype,
        :on_xml_decl,
        :on_element_children
      ].freeze

      ##
      # Parser callbacks whose return value is stored in {#node} and yielded
      # to the block passed to {#parse}.
      #
      # @return [Array]
      #
      BLOCK_CALLBACKS = [
        :on_cdata,
        :on_comment,
        :on_text,
        :on_proc_ins
      ].freeze

      ##
      # Returns the shorthands that can be used for various node classes.
      #
      # @return [Hash]
      #
      NODE_SHORTHANDS = {
        :text            => XML::Text,
        :node            => XML::Node,
        :cdata           => XML::Cdata,
        :element         => XML::Element,
        :doctype         => XML::Doctype,
        :comment         => XML::Comment,
        :xml_declaration => XML::XmlDeclaration
      }

      ##
      # Resets the parent parser state and clears the stored block, the
      # element nesting and the current node.
      #
      # @see Oga::XML::Parser#reset
      #
      def reset
        super

        @block   = nil
        @nesting = []
        @node    = nil
      end

      ##
      # Parses the input and yields every node to the supplied block.
      #
      # The block is optional: without one the input is still parsed, but no
      # nodes are yielded.
      #
      # @yieldparam [Oga::XML::Node]
      # @return [NilClass]
      #
      def parse(&block)
        @block = block

        super

        return
      end

      ##
      # Calls the supplied block if the current node type and optionally the
      # nesting match. This method allows you to write this:
      #
      #     parser.parse do |node|
      #       parser.on(:text, %w{people person name}) do
      #         puts node.text
      #       end
      #     end
      #
      # Instead of this:
      #
      #     parser.parse do |node|
      #       if node.is_a?(Oga::XML::Text) and parser.nesting == %w{people person name}
      #         puts node.text
      #       end
      #     end
      #
      # When calling this method you can specify the following node types:
      #
      # * `:cdata`
      # * `:comment`
      # * `:element`
      # * `:text`
      #
      # `:node` matches any node. `:doctype` and `:xml_declaration` are
      # accepted as well, but the callbacks producing those nodes are disabled
      # in this parser.
      #
      # @example
      #  parser.on(:element, %w{people person name}) do
      #
      #  end
      #
      # @param [Symbol] type The type of node to act upon, one of the keys of
      #  {NODE_SHORTHANDS}.
      #
      # @param [Array] nesting The element name nesting to act upon. An empty
      #  Array (the default) matches any nesting.
      #
      # @raise [TypeError] When `type` is not a key of {NODE_SHORTHANDS}; the
      #  lookup then yields `nil`, which `is_a?` rejects.
      #
      # @return [Object] The block's return value, or `nil` when the node or
      #  nesting did not match.
      #
      def on(type, nesting = [])
        return unless node.is_a?(NODE_SHORTHANDS[type])
        return unless nesting.empty? || nesting == self.nesting

        yield
      end

      # eval is a heck of a lot faster than define_method on both Rubinius and
      # JRuby.
      DISABLED_CALLBACKS.each do |method|
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*args)
          return
        end
        EOF
      end

      BLOCK_CALLBACKS.each do |method|
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*args)
          @node = super
          @block.call(@node) if @block
          return
        end
        EOF
      end

      ##
      # Builds the element using the parent parser, records its name in the
      # nesting and yields it to the block (if any).
      #
      # @see Oga::XML::Parser#on_element
      # @return [NilClass]
      #
      def on_element(*args)
        @node = super

        nesting << @node.name

        # Guarded so that parsing without a block doesn't call `nil.call`.
        @block.call(@node) if @block

        return
      end

      ##
      # Removes the element that was just closed from the nesting.
      #
      # @see Oga::XML::Parser#after_element
      # @return [NilClass]
      #
      def after_element(*args)
        nesting.pop

        return
      end
    end # PullParser
  end # XML
end # Oga
@@ -0,0 +1,56 @@
1
module Oga
  module XML
    ##
    # Mixin adding XPath and CSS query helpers to the object it is included
    # in. The including object itself is handed to the evaluator as the
    # context of every query.
    #
    module Querying
      ##
      # Runs an XPath expression against the receiver.
      #
      # @param [String] expression The XPath expression to run.
      # @param [Hash] variables Variables to bind.
      # @see Oga::XPath::Evaluator#initialize
      #
      def xpath(expression, variables = {})
        evaluator = XPath::Evaluator.new(self, variables)

        evaluator.evaluate(expression)
      end

      ##
      # Runs an XPath expression and, when the outcome is a node set, returns
      # only its first node. Any other kind of result is returned untouched.
      #
      # @see #xpath
      #
      def at_xpath(*args)
        outcome = xpath(*args)

        case outcome
        when XML::NodeSet then outcome.first
        else outcome
        end
      end

      ##
      # Runs a CSS expression against the receiver. The expression is parsed
      # through the CSS parser's cache and the resulting AST is evaluated by
      # the XPath evaluator.
      #
      # @param [String] expression The CSS expression to run.
      # @see Oga::XPath::Evaluator#initialize
      #
      def css(expression)
        parsed    = CSS::Parser.parse_with_cache(expression)
        evaluator = XPath::Evaluator.new(self)

        evaluator.evaluate_ast(parsed)
      end

      ##
      # Runs a CSS expression and, when the outcome is a node set, returns
      # only its first node. Any other kind of result is returned untouched.
      #
      # @see #css
      #
      def at_css(*args)
        outcome = css(*args)

        case outcome
        when XML::NodeSet then outcome.first
        else outcome
        end
      end
    end # Querying
  end # XML
end # Oga
@@ -0,0 +1,192 @@
1
module Oga
  module XML
    ##
    # The SaxParser class provides the basic interface for writing custom SAX
    # parsers. All callback methods defined in {Oga::XML::Parser} are delegated
    # to a dedicated handler class.
    #
    # To write a custom handler for the SAX parser, create a class that
    # implements one (or many) of the following callback methods:
    #
    # * `on_document`
    # * `on_doctype`
    # * `on_cdata`
    # * `on_comment`
    # * `on_proc_ins`
    # * `on_xml_decl`
    # * `on_text`
    # * `on_element`
    # * `on_element_children`
    # * `on_attribute`
    # * `on_attributes`
    # * `after_element`
    #
    # For example:
    #
    #     class SaxHandler
    #       def on_element(namespace, name, attrs = {})
    #         puts name
    #       end
    #     end
    #
    # You can then use it as follows:
    #
    #     handler = SaxHandler.new
    #     parser  = Oga::XML::SaxParser.new(handler, '<foo />')
    #
    #     parser.parse
    #
    # For information on the callback arguments see the documentation of the
    # corresponding methods in {Oga::XML::Parser}.
    #
    # ## Element Callbacks
    #
    # The SAX parser changes the behaviour of both `on_element` and
    # `after_element`. The latter in the regular parser only takes a
    # {Oga::XML::Element} instance. In the SAX parser it will instead take a
    # namespace name and the element name. This eases the process of figuring
    # out what element a callback is associated with.
    #
    # An example:
    #
    #     class SaxHandler
    #       def on_element(namespace, name, attrs = {})
    #         # ...
    #       end
    #
    #       def after_element(namespace, name)
    #         puts name # => "foo", "bar", etc
    #       end
    #     end
    #
    # ## Attributes
    #
    # Attributes returned by `on_attribute` are passed as a Hash as the 3rd
    # argument of the `on_element` callback. The keys of this Hash are the
    # attribute names (optionally prefixed by their namespace), the values are
    # the attribute values. You can overwrite `on_attribute` to control
    # individual attributes and `on_attributes` to control the final set.
    #
    class SaxParser < Parser
      ##
      # @param [Object] handler The SAX handler to delegate callbacks to.
      # @param [Array] args Remaining arguments, passed on to the parent
      #  initializer.
      # @see Oga::XML::Parser#initialize
      #
      def initialize(handler, *args)
        @handler = handler

        super(*args)
      end

      # Delegate all callbacks to the handler object. The generated methods
      # discard the handler's return value; callbacks whose return value
      # matters are redefined by hand below.
      instance_methods.grep(/\A(on_|after_)/).each do |method|
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*args)
          run_callback(:#{method}, *args)

          return
        end
        EOF
      end

      ##
      # Manually overwrite `on_element` so we can ensure that `after_element`
      # always receives the namespace and name.
      #
      # `attrs` defaults to an empty Hash, matching the Hash produced by
      # {#on_attributes} and the handler signature shown in the class
      # documentation.
      #
      # @param [String] namespace
      # @param [String] name
      # @param [Hash] attrs
      # @see Oga::XML::Parser#on_element
      # @return [Array] The namespace and name, in that order.
      #
      def on_element(namespace, name, attrs = {})
        run_callback(:on_element, namespace, name, attrs)

        [namespace, name]
      end

      ##
      # Manually overwrite `after_element` so it can take a namespace and name.
      # This differs a bit from the regular `after_element` which only takes an
      # {Oga::XML::Element} instance.
      #
      # @param [Array] namespace_with_name Splatted into the handler's
      #  `after_element` callback as separate arguments.
      # @return [NilClass]
      #
      def after_element(namespace_with_name)
        run_callback(:after_element, *namespace_with_name)

        return
      end

      ##
      # Manually overwrite this method since for this one we _do_ want the
      # return value so it can be passed to `on_element`.
      #
      # When the handler implements `on_attribute` its return value is used
      # as-is. Otherwise a single-pair Hash is built, using "ns:name" as the
      # key when a namespace is given and entity-decoding the value if
      # present.
      #
      # @param [String] name
      # @param [String] ns
      # @param [String] value
      # @see Oga::XML::Parser#on_attribute
      #
      def on_attribute(name, ns = nil, value = nil)
        if @handler.respond_to?(:on_attribute)
          return run_callback(:on_attribute, name, ns, value)
        end

        key = ns ? "#{ns}:#{name}" : name

        if value
          value = EntityDecoder.try_decode(value, @lexer.html?)
        end

        {key => value}
      end

      ##
      # Merges the attributes together into a Hash. When the handler
      # implements `on_attributes` its return value is used instead.
      #
      # @param [Array] attrs
      # @return [Hash]
      #
      def on_attributes(attrs)
        if @handler.respond_to?(:on_attributes)
          return run_callback(:on_attributes, attrs)
        end

        merged = {}

        attrs.each do |pair|
          # Hash#merge requires an extra allocation, this doesn't.
          pair.each { |key, value| merged[key] = value }
        end

        merged
      end

      ##
      # Passes text to the handler's `on_text` callback, if it defines one.
      # Entities are decoded first unless the text is inside a literal HTML
      # element (see {#inside_literal_html?}).
      #
      # @param [String] text
      # @return [NilClass]
      #
      def on_text(text)
        if @handler.respond_to?(:on_text)
          unless inside_literal_html?
            text = EntityDecoder.try_decode(text, @lexer.html?)
          end

          run_callback(:on_text, text)
        end

        return
      end

      private

      ##
      # Whether the lexer reports being inside an HTML script or style
      # element.
      #
      # @return [TrueClass|FalseClass]
      #
      def inside_literal_html?
        @lexer.html_script? || @lexer.html_style?
      end

      ##
      # Invokes the given callback on the handler when the handler responds
      # to it; returns `nil` otherwise.
      #
      # @param [Symbol] method
      # @param [Array] args
      # @return [Object] The handler's return value, if the callback ran.
      #
      def run_callback(method, *args)
        @handler.send(method, *args) if @handler.respond_to?(method)
      end
    end # SaxParser
  end # XML
end # Oga