rubyjedi-oga 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +13 -0
  3. data/LICENSE +362 -0
  4. data/README.md +317 -0
  5. data/doc/css/common.css +77 -0
  6. data/doc/css_selectors.md +935 -0
  7. data/doc/manually_creating_documents.md +67 -0
  8. data/doc/migrating_from_nokogiri.md +169 -0
  9. data/doc/xml_namespaces.md +63 -0
  10. data/ext/c/extconf.rb +11 -0
  11. data/ext/c/lexer.c +2595 -0
  12. data/ext/c/lexer.h +16 -0
  13. data/ext/c/lexer.rl +198 -0
  14. data/ext/c/liboga.c +6 -0
  15. data/ext/c/liboga.h +11 -0
  16. data/ext/java/Liboga.java +14 -0
  17. data/ext/java/org/liboga/xml/Lexer.java +1363 -0
  18. data/ext/java/org/liboga/xml/Lexer.rl +223 -0
  19. data/ext/ragel/base_lexer.rl +633 -0
  20. data/lib/oga.rb +57 -0
  21. data/lib/oga/blacklist.rb +40 -0
  22. data/lib/oga/css/lexer.rb +743 -0
  23. data/lib/oga/css/parser.rb +976 -0
  24. data/lib/oga/entity_decoder.rb +21 -0
  25. data/lib/oga/html/entities.rb +2150 -0
  26. data/lib/oga/html/parser.rb +25 -0
  27. data/lib/oga/html/sax_parser.rb +18 -0
  28. data/lib/oga/lru.rb +160 -0
  29. data/lib/oga/oga.rb +57 -0
  30. data/lib/oga/version.rb +3 -0
  31. data/lib/oga/whitelist.rb +20 -0
  32. data/lib/oga/xml/attribute.rb +136 -0
  33. data/lib/oga/xml/cdata.rb +17 -0
  34. data/lib/oga/xml/character_node.rb +37 -0
  35. data/lib/oga/xml/comment.rb +17 -0
  36. data/lib/oga/xml/default_namespace.rb +13 -0
  37. data/lib/oga/xml/doctype.rb +82 -0
  38. data/lib/oga/xml/document.rb +108 -0
  39. data/lib/oga/xml/element.rb +428 -0
  40. data/lib/oga/xml/entities.rb +122 -0
  41. data/lib/oga/xml/html_void_elements.rb +15 -0
  42. data/lib/oga/xml/lexer.rb +550 -0
  43. data/lib/oga/xml/namespace.rb +48 -0
  44. data/lib/oga/xml/node.rb +219 -0
  45. data/lib/oga/xml/node_set.rb +333 -0
  46. data/lib/oga/xml/parser.rb +631 -0
  47. data/lib/oga/xml/processing_instruction.rb +37 -0
  48. data/lib/oga/xml/pull_parser.rb +175 -0
  49. data/lib/oga/xml/querying.rb +56 -0
  50. data/lib/oga/xml/sax_parser.rb +192 -0
  51. data/lib/oga/xml/text.rb +66 -0
  52. data/lib/oga/xml/traversal.rb +50 -0
  53. data/lib/oga/xml/xml_declaration.rb +65 -0
  54. data/lib/oga/xpath/evaluator.rb +1798 -0
  55. data/lib/oga/xpath/lexer.rb +1958 -0
  56. data/lib/oga/xpath/parser.rb +622 -0
  57. data/oga.gemspec +45 -0
  58. metadata +227 -0
@@ -0,0 +1,37 @@
1
module Oga
  module XML
    ##
    # Node type that holds the data of one processing instruction: the
    # instruction name plus the text that follows it.
    #
    class ProcessingInstruction < CharacterNode
      # The name of the instruction.
      # @return [String]
      attr_accessor :name

      ##
      # Forwards the options to the superclass constructor and then stores the
      # instruction name.
      #
      # @param [Hash] options
      #
      # @option options [String] :name The name of the instruction.
      # @see Oga::XML::CharacterNode#initialize
      #
      def initialize(options = {})
        super(options)

        @name = options[:name]
      end

      ##
      # Serializes the instruction: the name immediately followed by the text,
      # wrapped in `<?` and `?>`. No separator is inserted between the two.
      #
      # @return [String]
      #
      def to_xml
        instruction = "#{name}#{text}"

        "<?#{instruction}?>"
      end

      ##
      # Human readable representation showing the name and the text.
      #
      # @return [String]
      #
      def inspect
        details = "name: #{name.inspect} text: #{text.inspect}"

        "ProcessingInstruction(#{details})"
      end
    end # ProcessingInstruction
  end # XML
end # Oga
@@ -0,0 +1,175 @@
1
module Oga
  module XML
    ##
    # The PullParser class can be used to parse an XML document incrementally
    # instead of parsing it as a whole. This results in lower memory usage and
    # potentially faster parsing times. The downside is that pull parsers are
    # typically more difficult to use compared to DOM parsers.
    #
    # Basic parsing using this class works as following:
    #
    #     parser = Oga::XML::PullParser.new('... xml here ...')
    #
    #     parser.parse do |node|
    #       if node.is_a?(Oga::XML::Element)
    #         # ...
    #       end
    #     end
    #
    # This parser yields proper XML instances such as {Oga::XML::Element}.
    # Doctypes and XML declarations are ignored by this parser.
    #
    class PullParser < Parser
      # The node that was most recently passed to the block, `nil` after a
      # reset.
      # @return [Oga::XML::Node]
      attr_reader :node

      # Array containing the names of the currently nested elements.
      # @return [Array]
      attr_reader :nesting

      ##
      # Callbacks that are overridden below to do nothing and return `nil`.
      #
      # @return [Array]
      #
      DISABLED_CALLBACKS = [
        :on_document,
        :on_doctype,
        :on_xml_decl,
        :on_element_children
      ]

      ##
      # Callbacks whose result (as produced by the superclass) is stored in
      # {#node} and handed to the block given to {#parse}.
      #
      # @return [Array]
      #
      BLOCK_CALLBACKS = [
        :on_cdata,
        :on_comment,
        :on_text,
        :on_proc_ins
      ]

      ##
      # Returns the shorthands that can be used for various node classes.
      #
      # @return [Hash]
      #
      NODE_SHORTHANDS = {
        :text            => XML::Text,
        :node            => XML::Node,
        :cdata           => XML::Cdata,
        :element         => XML::Element,
        :doctype         => XML::Doctype,
        :comment         => XML::Comment,
        # Processing instructions are yielded (see BLOCK_CALLBACKS), so they
        # need a shorthand as well.
        :proc_ins        => XML::ProcessingInstruction,
        :xml_declaration => XML::XmlDeclaration
      }

      ##
      # Resets the parser state, including the stored block, the element
      # nesting and the current node.
      #
      # @see Oga::XML::Parser#reset
      #
      def reset
        super

        @block   = nil
        @nesting = []
        @node    = nil
      end

      ##
      # Parses the input and yields every node to the supplied block.
      #
      # @yieldparam [Oga::XML::Node]
      # @return [NilClass]
      #
      def parse(&block)
        @block = block

        super

        return
      end

      ##
      # Calls the supplied block if the current node type and optionally the
      # nesting match. This method allows you to write this:
      #
      #     parser.parse do |node|
      #       parser.on(:text, %w{people person name}) do
      #         puts node.text
      #       end
      #     end
      #
      # Instead of this:
      #
      #     parser.parse do |node|
      #       if node.is_a?(Oga::XML::Text) and parser.nesting == %w{people person name}
      #         puts node.text
      #       end
      #     end
      #
      # When calling this method you can specify any key of
      # {NODE_SHORTHANDS}. The types that can actually match a yielded node
      # are:
      #
      # * `:cdata`
      # * `:comment`
      # * `:element`
      # * `:node`
      # * `:proc_ins`
      # * `:text`
      #
      # `:doctype` and `:xml_declaration` are accepted but never match, as the
      # corresponding callbacks are disabled in this parser.
      #
      # @example
      #  parser.on(:element, %w{people person name}) do
      #
      #  end
      #
      # @param [Symbol] type The type of node to act upon, one of the keys of
      #  {NODE_SHORTHANDS}.
      #
      # @param [Array] nesting The element name nesting to act upon. An empty
      #  Array matches any nesting.
      #
      # @raise [ArgumentError] When the given type is not a known shorthand.
      #
      def on(type, nesting = [])
        klass = NODE_SHORTHANDS[type]

        # Without this check an unknown type would end up in `is_a?(nil)`,
        # which fails with an unhelpful "class or module required" error.
        unless klass
          raise ArgumentError, "Unknown node type #{type.inspect}, " \
            "expected one of #{NODE_SHORTHANDS.keys.inspect}"
        end

        return unless node.is_a?(klass)

        yield if nesting.empty? || nesting == self.nesting
      end

      # eval is a heck of a lot faster than define_method on both Rubinius and
      # JRuby.
      DISABLED_CALLBACKS.each do |method|
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*args)
          return
        end
        EOF
      end

      BLOCK_CALLBACKS.each do |method|
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*args)
          @node = super
          @block.call(@node)
          return
        end
        EOF
      end

      ##
      # Stores the element built by the superclass, pushes its name onto the
      # nesting stack and yields it to the block.
      #
      # @see Oga::XML::Parser#on_element
      # @return [NilClass]
      #
      def on_element(*args)
        @node = super

        nesting << @node.name

        @block.call(@node)

        return
      end

      ##
      # Pops the name of the element that was just closed off the nesting
      # stack.
      #
      # @see Oga::XML::Parser#after_element
      # @return [NilClass]
      #
      def after_element(*args)
        nesting.pop

        return
      end
    end # PullParser
  end # XML
end # Oga
@@ -0,0 +1,56 @@
1
module Oga
  module XML
    ##
    # Mixin offering convenience methods for running XPath and CSS queries
    # with the including object as the query context.
    #
    module Querying
      ##
      # Runs an XPath expression against this object.
      #
      # @param [String] expression The XPath expression to run.
      # @param [Hash] variables Variables to bind.
      # @see Oga::XPath::Evaluator#initialize
      #
      def xpath(expression, variables = {})
        evaluator = XPath::Evaluator.new(self, variables)

        evaluator.evaluate(expression)
      end

      ##
      # Runs an XPath expression and, when the outcome is a node set, returns
      # only its first node. Any other kind of result is returned untouched.
      #
      # @see #xpath
      #
      def at_xpath(*args)
        found = xpath(*args)

        return found unless found.is_a?(XML::NodeSet)

        found.first
      end

      ##
      # Runs a CSS expression against this object. The expression is parsed
      # (using the parser's cache) into an AST which is then evaluated by the
      # XPath evaluator.
      #
      # @param [String] expression The CSS expression to run.
      # @see Oga::XPath::Evaluator#initialize
      #
      def css(expression)
        parsed    = CSS::Parser.parse_with_cache(expression)
        evaluator = XPath::Evaluator.new(self)

        evaluator.evaluate_ast(parsed)
      end

      ##
      # Runs a CSS expression and, when the outcome is a node set, returns
      # only its first node. Any other kind of result is returned untouched.
      #
      # @see #css
      #
      def at_css(*args)
        found = css(*args)

        return found unless found.is_a?(XML::NodeSet)

        found.first
      end
    end # Querying
  end # XML
end # Oga
@@ -0,0 +1,192 @@
1
module Oga
  module XML
    ##
    # The SaxParser class provides the basic interface for writing custom SAX
    # parsers. All callback methods defined in {Oga::XML::Parser} are delegated
    # to a dedicated handler class.
    #
    # To write a custom handler for the SAX parser, create a class that
    # implements one (or many) of the following callback methods:
    #
    # * `on_document`
    # * `on_doctype`
    # * `on_cdata`
    # * `on_comment`
    # * `on_proc_ins`
    # * `on_xml_decl`
    # * `on_text`
    # * `on_element`
    # * `on_element_children`
    # * `on_attribute`
    # * `on_attributes`
    # * `after_element`
    #
    # For example:
    #
    #     class SaxHandler
    #       def on_element(namespace, name, attrs = {})
    #         puts name
    #       end
    #     end
    #
    # You can then use it as following:
    #
    #     handler = SaxHandler.new
    #     parser  = Oga::XML::SaxParser.new(handler, '<foo />')
    #
    #     parser.parse
    #
    # For information on the callback arguments see the documentation of the
    # corresponding methods in {Oga::XML::Parser}.
    #
    # ## Element Callbacks
    #
    # The SAX parser changes the behaviour of both `on_element` and
    # `after_element`. The latter in the regular parser only takes a
    # {Oga::XML::Element} instance. In the SAX parser it will instead take a
    # namespace name and the element name. This eases the process of figuring
    # out what element a callback is associated with.
    #
    # An example:
    #
    #     class SaxHandler
    #       def on_element(namespace, name, attrs = {})
    #         # ...
    #       end
    #
    #       def after_element(namespace, name)
    #         puts name # => "foo", "bar", etc
    #       end
    #     end
    #
    # ## Attributes
    #
    # Attributes returned by `on_attribute` are passed as a Hash as the 3rd
    # argument of the `on_element` callback. The keys of this Hash are the
    # attribute names (optionally prefixed by their namespace) and their values.
    # You can overwrite `on_attribute` to control individual attributes and
    # `on_attributes` to control the final set.
    #
    class SaxParser < Parser
      ##
      # @param [Object] handler The SAX handler to delegate callbacks to.
      # @param [Array] args Remaining arguments, passed on to the superclass
      #  constructor.
      # @see Oga::XML::Parser#initialize
      #
      def initialize(handler, *args)
        @handler = handler

        super(*args)
      end

      # Delegate all callbacks to the handler object. The generated methods
      # discard the handler's return value; callbacks that need different
      # behaviour are redefined explicitly further down.
      instance_methods.grep(/\A(?:on_|after_)/).each do |method|
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*args)
          run_callback(:#{method}, *args)

          return
        end
        EOF
      end

      ##
      # Manually overwrite `on_element` so we can ensure that `after_element`
      # always receives the namespace and name.
      #
      # @param [String] namespace
      # @param [String] name
      # @param [Hash] attrs The attributes as produced by {#on_attributes}.
      #  Defaults to an empty Hash so handlers always receive the type that
      #  is documented for this callback.
      #
      # @see Oga::XML::Parser#on_element
      # @return [Array]
      #
      def on_element(namespace, name, attrs = {})
        run_callback(:on_element, namespace, name, attrs)

        [namespace, name]
      end

      ##
      # Manually overwrite `after_element` so it can take a namespace and name.
      # This differs a bit from the regular `after_element` which only takes an
      # {Oga::XML::Element} instance.
      #
      # @param [Array] namespace_with_name The namespace/name pair as returned
      #  by {#on_element}.
      #
      def after_element(namespace_with_name)
        run_callback(:after_element, *namespace_with_name)

        return
      end

      ##
      # Manually overwrite this method since for this one we _do_ want the
      # return value so it can be passed to `on_element`.
      #
      # When the handler implements `on_attribute` its return value is used
      # as-is. Otherwise a single-pair Hash is returned with the (optionally
      # namespace prefixed) name as key and the entity-decoded value.
      #
      # @param [String] name
      # @param [String] ns
      # @param [String] value
      # @return [Hash] Unless the handler returns something else.
      #
      # @see Oga::XML::Parser#on_attribute
      #
      def on_attribute(name, ns = nil, value = nil)
        if @handler.respond_to?(:on_attribute)
          return run_callback(:on_attribute, name, ns, value)
        end

        key   = ns ? "#{ns}:#{name}" : name
        value = EntityDecoder.try_decode(value, @lexer.html?) if value

        {key => value}
      end

      ##
      # Merges the attributes together into a Hash, unless the handler
      # implements `on_attributes` in which case its return value is used.
      #
      # @param [Array] attrs
      # @return [Hash]
      #
      def on_attributes(attrs)
        if @handler.respond_to?(:on_attributes)
          return run_callback(:on_attributes, attrs)
        end

        merged = {}

        attrs.each do |pair|
          # Hash#merge requires an extra allocation, this doesn't.
          pair.each { |key, value| merged[key] = value }
        end

        merged
      end

      ##
      # Passes text to the handler's `on_text` callback, if there is one.
      # Entities are decoded first, except for text inside literal HTML
      # elements (script/style).
      #
      # @param [String] text
      #
      def on_text(text)
        if @handler.respond_to?(:on_text)
          unless inside_literal_html?
            text = EntityDecoder.try_decode(text, @lexer.html?)
          end

          run_callback(:on_text, text)
        end

        return
      end

      private

      ##
      # Whether the lexer reports being inside an HTML script or style
      # element.
      #
      # @return [TrueClass|FalseClass]
      #
      def inside_literal_html?
        @lexer.html_script? || @lexer.html_style?
      end

      ##
      # Invokes the given callback on the handler, but only when the handler
      # responds to it. Returns `nil` otherwise.
      #
      # @param [Symbol] method
      # @param [Array] args
      #
      def run_callback(method, *args)
        @handler.send(method, *args) if @handler.respond_to?(method)
      end
    end # SaxParser
  end # XML
end # Oga