oga 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +13 -0
  3. data/LICENSE +19 -0
  4. data/README.md +171 -0
  5. data/doc/DCO.md +25 -0
  6. data/doc/changelog.md +7 -0
  7. data/doc/css/common.css +76 -0
  8. data/doc/migrating_from_nokogiri.md +169 -0
  9. data/ext/c/extconf.rb +13 -0
  10. data/ext/c/lexer.c +1518 -0
  11. data/ext/c/lexer.h +8 -0
  12. data/ext/c/lexer.rl +121 -0
  13. data/ext/c/liboga.c +6 -0
  14. data/ext/c/liboga.h +11 -0
  15. data/ext/java/Liboga.java +14 -0
  16. data/ext/java/org/liboga/xml/Lexer.java +829 -0
  17. data/ext/java/org/liboga/xml/Lexer.rl +151 -0
  18. data/ext/ragel/base_lexer.rl +323 -0
  19. data/lib/oga.rb +43 -0
  20. data/lib/oga/html/parser.rb +25 -0
  21. data/lib/oga/oga.rb +27 -0
  22. data/lib/oga/version.rb +3 -0
  23. data/lib/oga/xml/attribute.rb +111 -0
  24. data/lib/oga/xml/cdata.rb +24 -0
  25. data/lib/oga/xml/character_node.rb +39 -0
  26. data/lib/oga/xml/comment.rb +24 -0
  27. data/lib/oga/xml/doctype.rb +91 -0
  28. data/lib/oga/xml/document.rb +99 -0
  29. data/lib/oga/xml/element.rb +340 -0
  30. data/lib/oga/xml/lexer.rb +399 -0
  31. data/lib/oga/xml/namespace.rb +42 -0
  32. data/lib/oga/xml/node.rb +175 -0
  33. data/lib/oga/xml/node_set.rb +313 -0
  34. data/lib/oga/xml/parser.rb +556 -0
  35. data/lib/oga/xml/processing_instruction.rb +39 -0
  36. data/lib/oga/xml/pull_parser.rb +166 -0
  37. data/lib/oga/xml/querying.rb +32 -0
  38. data/lib/oga/xml/text.rb +16 -0
  39. data/lib/oga/xml/traversal.rb +48 -0
  40. data/lib/oga/xml/xml_declaration.rb +76 -0
  41. data/lib/oga/xpath/evaluator.rb +1748 -0
  42. data/lib/oga/xpath/lexer.rb +2043 -0
  43. data/lib/oga/xpath/node.rb +10 -0
  44. data/lib/oga/xpath/parser.rb +535 -0
  45. data/oga.gemspec +45 -0
  46. metadata +221 -0
@@ -0,0 +1,43 @@
1
+ require 'ast'
2
+ require 'set'
3
+ require 'stringio'
4
+
5
+ require_relative 'oga/version'
6
+ require_relative 'oga/oga'
7
+
8
+ # Load these first so that the native extensions don't have to define the
9
+ # Oga::XML namespace.
10
+ require_relative 'oga/xml/lexer'
11
+ require_relative 'oga/xml/parser'
12
+ require_relative 'oga/xml/pull_parser'
13
+
14
+ require_relative 'liboga'
15
+
16
+ #:nocov:
17
+ if RUBY_PLATFORM == 'java'
18
+ org.liboga.Liboga.load(JRuby.runtime)
19
+ end
20
+ #:nocov:
21
+
22
+ require_relative 'oga/xml/querying'
23
+ require_relative 'oga/xml/traversal'
24
+ require_relative 'oga/xml/node'
25
+ require_relative 'oga/xml/document'
26
+ require_relative 'oga/xml/character_node'
27
+ require_relative 'oga/xml/text'
28
+ require_relative 'oga/xml/comment'
29
+ require_relative 'oga/xml/cdata'
30
+ require_relative 'oga/xml/xml_declaration'
31
+ require_relative 'oga/xml/processing_instruction'
32
+ require_relative 'oga/xml/doctype'
33
+ require_relative 'oga/xml/namespace'
34
+ require_relative 'oga/xml/attribute'
35
+ require_relative 'oga/xml/element'
36
+ require_relative 'oga/xml/node_set'
37
+
38
+ require_relative 'oga/html/parser'
39
+
40
+ require_relative 'oga/xpath/node'
41
+ require_relative 'oga/xpath/lexer'
42
+ require_relative 'oga/xpath/parser'
43
+ require_relative 'oga/xpath/evaluator'
@@ -0,0 +1,25 @@
1
module Oga
  module HTML
    ##
    # HTML flavoured parser. This is a thin subclass of {Oga::XML::Parser}
    # whose only job is to switch the `:html` option on, so callers don't have
    # to pass it themselves.
    #
    # Usage:
    #
    #     Oga::HTML::Parser.new('<meta charset="utf-8">').parse
    #
    class Parser < XML::Parser
      ##
      # @param [String|IO] data
      # @param [Hash] options Extra parser options. `:html => true` is always
      #  merged on top of them; the Hash that was passed in is not modified.
      # @see Oga::XML::Parser#initialize
      #
      def initialize(data, options = {})
        super(data, options.merge(:html => true))
      end
    end # Parser
  end # HTML
end # Oga
@@ -0,0 +1,27 @@
1
module Oga
  ##
  # Parses the given XML document.
  #
  # @example
  #  document = Oga.parse_xml('<root>Hello</root>')
  #
  # @param [String|IO] xml The XML input to parse.
  # @param [Hash] options Optional settings, handed to `XML::Parser.new`
  #  unchanged. Defaults to an empty Hash.
  # @return [Oga::XML::Document]
  #
  def self.parse_xml(xml, options = {})
    return XML::Parser.new(xml, options).parse
  end

  ##
  # Parses the given HTML document.
  #
  # @example
  #  document = Oga.parse_html('<html>...</html>')
  #
  # @param [String|IO] html The HTML input to parse.
  # @param [Hash] options Optional settings, handed to `HTML::Parser.new`
  #  unchanged. Defaults to an empty Hash.
  # @return [Oga::XML::Document]
  #
  def self.parse_html(html, options = {})
    return HTML::Parser.new(html, options).parse
  end
end # Oga
@@ -0,0 +1,3 @@
1
module Oga
  # Version of the gem. Frozen so the shared String can't be mutated in place
  # by code that reads it.
  VERSION = '0.1.0'.freeze
end # Oga
@@ -0,0 +1,111 @@
1
module Oga
  module XML
    ##
    # Class for storing information about a single XML attribute.
    #
    # @!attribute [rw] name
    #  The name of the attribute.
    #  @return [String]
    #
    # @!attribute [rw] namespace_name
    #  The namespace name (prefix) of the attribute, if any.
    #  @return [String]
    #
    # @!attribute [rw] value
    #  The value of the attribute.
    #  @return [String]
    #
    # @!attribute [rw] element
    #  The element this attribute belongs to.
    #  @return [Oga::XML::Element]
    #
    class Attribute
      attr_accessor :name, :namespace_name, :element, :value

      ##
      # The default namespace available to all attributes. This namespace can
      # not be modified.
      #
      # @return [Oga::XML::Namespace]
      #
      DEFAULT_NAMESPACE = Namespace.new(
        :name => 'xml',
        :uri  => 'http://www.w3.org/XML/1998/namespace'
      ).freeze

      ##
      # @param [Hash] options
      #
      # @option options [String] :name
      # @option options [String] :namespace_name
      # @option options [String] :value
      # @option options [Oga::XML::Element] :element
      #
      def initialize(options = {})
        @name    = options[:name]
        @value   = options[:value]
        @element = options[:element]

        @namespace_name = options[:namespace_name]
      end

      ##
      # Returns the {Oga::XML::Namespace} instance for the current namespace
      # name. The built-in "xml" namespace is used when the namespace name
      # matches it, otherwise the namespace is looked up in the namespaces
      # available to the owning element. A successful lookup is cached.
      #
      # @return [Oga::XML::Namespace|NilClass] `nil` when the attribute is not
      #  attached to an element or the lookup yields nothing.
      #
      def namespace
        unless @namespace
          if namespace_name == DEFAULT_NAMESPACE.name
            @namespace = DEFAULT_NAMESPACE
          elsif element
            # Only consult the element when there is one; a detached attribute
            # simply has no resolvable namespace instead of raising.
            @namespace = element.available_namespaces[namespace_name]
          end
        end

        return @namespace
      end

      ##
      # Returns the value of the attribute.
      #
      # @return [String]
      #
      def text
        return value.to_s
      end

      alias_method :to_s, :text

      ##
      # Converts the attribute to its `name="value"` form, prefixed with the
      # namespace name when one is set.
      #
      # @return [String]
      #
      def to_xml
        if namespace_name
          # Use the resolved namespace's name when available and fall back to
          # the raw prefix so serialization never fails on an unresolved one.
          resolved  = namespace
          prefix    = resolved ? resolved.name : namespace_name
          full_name = "#{prefix}:#{name}"
        else
          full_name = name
        end

        return %Q(#{full_name}="#{value}")
      end

      ##
      # @return [String]
      #
      def inspect
        segments = []

        [:name, :namespace, :value].each do |attr|
          # Named `current` so it doesn't shadow the `value` accessor.
          current = send(attr)

          if current
            segments << "#{attr}: #{current.inspect}"
          end
        end

        return "Attribute(#{segments.join(' ')})"
      end
    end # Attribute
  end # XML
end # Oga
@@ -0,0 +1,24 @@
1
module Oga
  module XML
    ##
    # Node class for CDATA sections.
    #
    class Cdata < CharacterNode
      ##
      # Serializes the node by wrapping its text in the CDATA delimiters.
      #
      # @return [String]
      #
      def to_xml
        "<![CDATA[#{text}]]>"
      end

      ##
      # @return [Symbol] Always `:cdata`.
      #
      def node_type
        :cdata
      end
    end # Cdata
  end # XML
end # Oga
@@ -0,0 +1,39 @@
1
module Oga
  module XML
    ##
    # Shared parent for nodes whose content is a piece of text, such as Text
    # and Comment nodes.
    #
    # @!attribute [rw] text
    #  @return [String]
    #
    class CharacterNode < Node
      attr_accessor :text

      ##
      # @param [Hash] options Passed on to the parent class as well.
      #
      # @option options [String] :text The text of the node.
      #
      def initialize(options = {})
        # Bare `super` forwards `options` to the parent initializer.
        super

        @text = options[:text]
      end

      ##
      # @return [String] The text as is; an empty String when no text is set.
      #
      def to_xml
        text.to_s
      end

      ##
      # @return [String] The class name without its namespace, followed by the
      #  inspected text in parentheses.
      #
      def inspect
        short_name = self.class.to_s.split('::').last

        "#{short_name}(#{text.inspect})"
      end
    end # CharacterNode
  end # XML
end # Oga
@@ -0,0 +1,24 @@
1
module Oga
  module XML
    ##
    # Node class for XML comments.
    #
    class Comment < CharacterNode
      ##
      # Serializes the node by wrapping its text in the comment delimiters.
      #
      # @return [String]
      #
      def to_xml
        "<!--#{text}-->"
      end

      ##
      # @return [Symbol] Always `:comment`.
      #
      def node_type
        :comment
      end
    end # Comment
  end # XML
end # Oga
@@ -0,0 +1,91 @@
1
module Oga
  module XML
    ##
    # Class used for storing information about Doctypes.
    #
    # @!attribute [rw] name
    #  The name of the doctype (e.g. "HTML").
    #  @return [String]
    #
    # @!attribute [rw] type
    #  The type of the doctype (e.g. "PUBLIC").
    #  @return [String]
    #
    # @!attribute [rw] public_id
    #  The public ID of the doctype.
    #  @return [String]
    #
    # @!attribute [rw] system_id
    #  The system ID of the doctype.
    #  @return [String]
    #
    # @!attribute [rw] inline_rules
    #  The inline doctype rules.
    #  @return [String]
    #
    class Doctype
      attr_accessor :name, :type, :public_id, :system_id, :inline_rules

      ##
      # @example
      #  dtd = Doctype.new(:name => 'html', :type => 'PUBLIC')
      #
      # @param [Hash] options
      #
      # @option options [String] :name
      # @option options [String] :type
      # @option options [String] :public_id
      # @option options [String] :system_id
      # @option options [String] :inline_rules
      #
      def initialize(options = {})
        @name         = options[:name]
        @type         = options[:type]
        @public_id    = options[:public_id]
        @system_id    = options[:system_id]
        @inline_rules = options[:inline_rules]
      end

      ##
      # Converts the doctype back to XML. Only the parts that are set (not
      # `nil`) are emitted, in the order type, public ID, system ID, inline
      # rules.
      #
      # @return [String]
      #
      def to_xml
        xml = "<!DOCTYPE #{name}"

        xml << " #{type}" if type
        xml << %Q{ "#{public_id}"} if public_id
        xml << %Q{ "#{system_id}"} if system_id
        xml << " [#{inline_rules}]" if inline_rules

        return xml << '>'
      end

      ##
      # Inspects the doctype. Attributes that are `nil` or empty are left out.
      #
      # @return [String]
      #
      def inspect
        segments = []

        [:name, :type, :public_id, :system_id, :inline_rules].each do |attr|
          value = send(attr)

          if value && !value.empty?
            segments << "#{attr}: #{value.inspect}"
          end
        end

        return "Doctype(#{segments.join(' ')})"
      end

      ##
      # @return [Symbol]
      #
      def node_type
        return :doctype
      end
    end # Doctype
  end # XML
end # Oga
@@ -0,0 +1,99 @@
1
module Oga
  module XML
    ##
    # Represents a complete XML document: its doctype, its XML declaration and
    # its child nodes.
    #
    # @!attribute [rw] doctype
    #  The doctype of the document.
    #  @return [Oga::XML::Doctype]
    #
    # @!attribute [rw] xml_declaration
    #  The XML declaration of the document.
    #  @return [Oga::XML::XmlDeclaration]
    #
    class Document
      include Querying
      include Traversal

      attr_accessor :doctype, :xml_declaration

      ##
      # @param [Hash] options
      #
      # @option options [Oga::XML::NodeSet|Array] :children
      # @option options [Oga::XML::Doctype] :doctype
      # @option options [Oga::XML::XmlDeclaration] :xml_declaration
      #
      def initialize(options = {})
        @doctype         = options[:doctype]
        @xml_declaration = options[:xml_declaration]

        nodes = options[:children]

        # Goes through the writer so plain Arrays get wrapped in a NodeSet.
        self.children = nodes if nodes
      end

      ##
      # Returns the child nodes, creating an empty set owned by this document
      # on first access.
      #
      # @return [Oga::XML::NodeSet]
      #
      def children
        @children ||= NodeSet.new([], self)
      end

      ##
      # Sets the child nodes of the document. A NodeSet is stored as given,
      # anything else is wrapped in a new NodeSet created with this document.
      #
      # @param [Oga::XML::NodeSet|Array] nodes
      #
      def children=(nodes)
        @children = nodes.is_a?(NodeSet) ? nodes : NodeSet.new(nodes, self)
      end

      ##
      # Converts the document and its child nodes to XML. The doctype and the
      # XML declaration, when present, are each put on their own line in front
      # of the (stripped) output that follows them.
      #
      # @return [String]
      #
      def to_xml
        xml = children.map { |node| node.to_xml }.join('')

        xml = doctype.to_xml + "\n" + xml.strip if doctype
        xml = xml_declaration.to_xml + "\n" + xml.strip if xml_declaration

        xml
      end

      ##
      # Inspects the document: one "name: value" segment for each of the
      # doctype, XML declaration and children that is set.
      #
      # @return [String]
      #
      def inspect
        segments = [:doctype, :xml_declaration, :children].map do |attr|
          value = send(attr)

          "#{attr}: #{value.inspect}" if value
        end.compact

        # NOTE(review): the whitespace used to indent the segments may have
        # been collapsed in the copy this was reviewed from; confirm the
        # separator and leading indentation against the released file.
        "Document(\n#{segments.join("\n ")}\n)"
      end
    end # Document
  end # XML
end # Oga