oga 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +13 -0
  3. data/LICENSE +19 -0
  4. data/README.md +171 -0
  5. data/doc/DCO.md +25 -0
  6. data/doc/changelog.md +7 -0
  7. data/doc/css/common.css +76 -0
  8. data/doc/migrating_from_nokogiri.md +169 -0
  9. data/ext/c/extconf.rb +13 -0
  10. data/ext/c/lexer.c +1518 -0
  11. data/ext/c/lexer.h +8 -0
  12. data/ext/c/lexer.rl +121 -0
  13. data/ext/c/liboga.c +6 -0
  14. data/ext/c/liboga.h +11 -0
  15. data/ext/java/Liboga.java +14 -0
  16. data/ext/java/org/liboga/xml/Lexer.java +829 -0
  17. data/ext/java/org/liboga/xml/Lexer.rl +151 -0
  18. data/ext/ragel/base_lexer.rl +323 -0
  19. data/lib/oga.rb +43 -0
  20. data/lib/oga/html/parser.rb +25 -0
  21. data/lib/oga/oga.rb +27 -0
  22. data/lib/oga/version.rb +3 -0
  23. data/lib/oga/xml/attribute.rb +111 -0
  24. data/lib/oga/xml/cdata.rb +24 -0
  25. data/lib/oga/xml/character_node.rb +39 -0
  26. data/lib/oga/xml/comment.rb +24 -0
  27. data/lib/oga/xml/doctype.rb +91 -0
  28. data/lib/oga/xml/document.rb +99 -0
  29. data/lib/oga/xml/element.rb +340 -0
  30. data/lib/oga/xml/lexer.rb +399 -0
  31. data/lib/oga/xml/namespace.rb +42 -0
  32. data/lib/oga/xml/node.rb +175 -0
  33. data/lib/oga/xml/node_set.rb +313 -0
  34. data/lib/oga/xml/parser.rb +556 -0
  35. data/lib/oga/xml/processing_instruction.rb +39 -0
  36. data/lib/oga/xml/pull_parser.rb +166 -0
  37. data/lib/oga/xml/querying.rb +32 -0
  38. data/lib/oga/xml/text.rb +16 -0
  39. data/lib/oga/xml/traversal.rb +48 -0
  40. data/lib/oga/xml/xml_declaration.rb +76 -0
  41. data/lib/oga/xpath/evaluator.rb +1748 -0
  42. data/lib/oga/xpath/lexer.rb +2043 -0
  43. data/lib/oga/xpath/node.rb +10 -0
  44. data/lib/oga/xpath/parser.rb +535 -0
  45. data/oga.gemspec +45 -0
  46. metadata +221 -0
@@ -0,0 +1,43 @@
1
+ require 'ast'
2
+ require 'set'
3
+ require 'stringio'
4
+
5
+ require_relative 'oga/version'
6
+ require_relative 'oga/oga'
7
+
8
+ # Load these first so that the native extensions don't have to define the
9
+ # Oga::XML namespace.
10
+ require_relative 'oga/xml/lexer'
11
+ require_relative 'oga/xml/parser'
12
+ require_relative 'oga/xml/pull_parser'
13
+
14
+ require_relative 'liboga'
15
+
16
+ #:nocov:
17
+ if RUBY_PLATFORM == 'java'
18
+ org.liboga.Liboga.load(JRuby.runtime)
19
+ end
20
+ #:nocov:
21
+
22
+ require_relative 'oga/xml/querying'
23
+ require_relative 'oga/xml/traversal'
24
+ require_relative 'oga/xml/node'
25
+ require_relative 'oga/xml/document'
26
+ require_relative 'oga/xml/character_node'
27
+ require_relative 'oga/xml/text'
28
+ require_relative 'oga/xml/comment'
29
+ require_relative 'oga/xml/cdata'
30
+ require_relative 'oga/xml/xml_declaration'
31
+ require_relative 'oga/xml/processing_instruction'
32
+ require_relative 'oga/xml/doctype'
33
+ require_relative 'oga/xml/namespace'
34
+ require_relative 'oga/xml/attribute'
35
+ require_relative 'oga/xml/element'
36
+ require_relative 'oga/xml/node_set'
37
+
38
+ require_relative 'oga/html/parser'
39
+
40
+ require_relative 'oga/xpath/node'
41
+ require_relative 'oga/xpath/lexer'
42
+ require_relative 'oga/xpath/parser'
43
+ require_relative 'oga/xpath/evaluator'
@@ -0,0 +1,25 @@
1
module Oga
  module HTML
    ##
    # HTML flavoured parser. This class is a thin layer on top of
    # {Oga::XML::Parser}: the only thing it adds is forcing the `:html` option
    # to `true` before handing control to the XML parser.
    #
    # Usage:
    #
    #     Oga::HTML::Parser.new('<meta charset="utf-8">').parse
    #
    class Parser < XML::Parser
      ##
      # @param [String|IO] data The HTML input to parse.
      # @param [Hash] options Extra parser options. The Hash itself is left
      #  untouched; a merged copy with `:html => true` is passed on.
      #
      # @see Oga::XML::Parser#initialize
      #
      def initialize(data, options = {})
        super(data, options.merge(:html => true))
      end
    end # Parser
  end # HTML
end # Oga
@@ -0,0 +1,27 @@
1
module Oga
  ##
  # Convenience entry point: builds an {Oga::XML::Parser} for the given input
  # and returns whatever parsing it produces.
  #
  # @example
  #  document = Oga.parse_xml('<root>Hello</root>')
  #
  # @param [String|IO] xml The XML input to parse.
  # @return [Oga::XML::Document]
  #
  def self.parse_xml(xml)
    parser = XML::Parser.new(xml)

    parser.parse
  end

  ##
  # Convenience entry point: builds an {Oga::HTML::Parser} for the given input
  # and returns whatever parsing it produces.
  #
  # @example
  #  document = Oga.parse_html('<html>...</html>')
  #
  # @param [String|IO] html The HTML input to parse.
  # @return [Oga::XML::Document]
  #
  def self.parse_html(html)
    parser = HTML::Parser.new(html)

    parser.parse
  end
end # Oga
@@ -0,0 +1,3 @@
1
module Oga
  ##
  # The version of the Oga library. The String is frozen so that code reading
  # the constant can not accidentally modify it in place.
  #
  # @return [String]
  #
  VERSION = '0.1.0'.freeze
end # Oga
@@ -0,0 +1,111 @@
1
module Oga
  module XML
    ##
    # Class for storing information about a single XML attribute.
    #
    # @!attribute [rw] name
    #  The name of the attribute.
    #  @return [String]
    #
    # @!attribute [rw] namespace_name
    #  The name of the namespace of the attribute, if any.
    #  @return [String]
    #
    # @!attribute [rw] value
    #  The value of the attribute.
    #  @return [String]
    #
    # @!attribute [rw] element
    #  The element this attribute belongs to.
    #  @return [Oga::XML::Element]
    #
    class Attribute
      attr_accessor :name, :namespace_name, :element, :value

      ##
      # The default namespace available to all attributes. This namespace can
      # not be modified.
      #
      # @return [Oga::XML::Namespace]
      #
      DEFAULT_NAMESPACE = Namespace.new(
        :name => 'xml',
        :uri  => 'http://www.w3.org/XML/1998/namespace'
      ).freeze

      ##
      # @param [Hash] options
      #
      # @option options [String] :name
      # @option options [String] :namespace_name
      # @option options [String] :value
      # @option options [Oga::XML::Element] :element
      #
      def initialize(options = {})
        @name    = options[:name]
        @value   = options[:value]
        @element = options[:element]

        @namespace_name = options[:namespace_name]
      end

      ##
      # Returns the {Oga::XML::Namespace} instance for the current namespace
      # name. The result is memoized once a namespace has been found.
      #
      # When the attribute is not attached to an element only the default
      # namespace can be resolved; in every other case `nil` is returned
      # instead of raising.
      #
      # @return [Oga::XML::Namespace|NilClass]
      #
      def namespace
        unless @namespace
          if namespace_name == DEFAULT_NAMESPACE.name
            @namespace = DEFAULT_NAMESPACE
          elsif element
            # Only attached attributes have a set of namespaces to search.
            @namespace = element.available_namespaces[namespace_name]
          end
        end

        return @namespace
      end

      ##
      # Returns the value of the attribute as a String (an empty String when
      # no value is set).
      #
      # @return [String]
      #
      def text
        return value.to_s
      end

      alias_method :to_s, :text

      ##
      # Converts the attribute to its `name="value"` XML form, prefixing the
      # name with the namespace name when one is set.
      #
      # The prefix is taken from {#namespace_name} directly so that an
      # attribute whose namespace can not be resolved (or that has no
      # element) can still be serialized.
      # NOTE(review): assumes namespaces are registered under their own name,
      # so this matches the resolved namespace's name -- confirm.
      #
      # @return [String]
      #
      def to_xml
        if namespace_name
          full_name = "#{namespace_name}:#{name}"
        else
          full_name = name
        end

        return %Q(#{full_name}="#{value}")
      end

      ##
      # Returns a short description listing the name, namespace and value of
      # the attribute. Fields without a value are left out.
      #
      # @return [String]
      #
      def inspect
        segments = []

        [:name, :namespace, :value].each do |attr|
          current = send(attr)

          if current
            segments << "#{attr}: #{current.inspect}"
          end
        end

        return "Attribute(#{segments.join(' ')})"
      end
    end # Attribute
  end # XML
end # Oga
@@ -0,0 +1,24 @@
1
module Oga
  module XML
    ##
    # Node representing a CDATA section.
    #
    class Cdata < CharacterNode
      ##
      # Serializes the node by wrapping its text in the CDATA delimiters.
      #
      # @return [String]
      #
      def to_xml
        "<![CDATA[#{text}]]>"
      end

      ##
      # The type of this node, always `:cdata`.
      #
      # @return [Symbol]
      #
      def node_type
        :cdata
      end
    end # Cdata
  end # XML
end # Oga
@@ -0,0 +1,39 @@
1
module Oga
  module XML
    ##
    # Shared parent class of the nodes that wrap a piece of text, such as Text
    # and Comment nodes.
    #
    # @!attribute [rw] text
    #  The text stored in the node.
    #  @return [String]
    #
    class CharacterNode < Node
      attr_accessor :text

      ##
      # Runs the parent initializer with the same arguments and then stores
      # the `:text` option.
      #
      # @param [Hash] options
      #
      # @option options [String] :text The text of the node.
      #
      def initialize(options = {})
        super

        @text = options[:text]
      end

      ##
      # Returns the text as is; an empty String when no text is set.
      #
      # @return [String]
      #
      def to_xml
        text.to_s
      end

      ##
      # Describes the node as `ClassName("text")`, where the class name has its
      # namespace removed.
      #
      # @return [String]
      #
      def inspect
        short_name = self.class.to_s.split('::').last

        "#{short_name}(#{text.inspect})"
      end
    end # CharacterNode
  end # XML
end # Oga
@@ -0,0 +1,24 @@
1
module Oga
  module XML
    ##
    # Node representing an XML comment.
    #
    class Comment < CharacterNode
      ##
      # Serializes the node by wrapping its text in the comment delimiters.
      #
      # @return [String]
      #
      def to_xml
        "<!--#{text}-->"
      end

      ##
      # The type of this node, always `:comment`.
      #
      # @return [Symbol]
      #
      def node_type
        :comment
      end
    end # Comment
  end # XML
end # Oga
@@ -0,0 +1,91 @@
1
module Oga
  module XML
    ##
    # Value object describing a doctype declaration.
    #
    # @!attribute [rw] name
    #  The name of the doctype (e.g. "HTML").
    #  @return [String]
    #
    # @!attribute [rw] type
    #  The type of the doctype (e.g. "PUBLIC").
    #  @return [String]
    #
    # @!attribute [rw] public_id
    #  The public ID of the doctype.
    #  @return [String]
    #
    # @!attribute [rw] system_id
    #  The system ID of the doctype.
    #  @return [String]
    #
    # @!attribute [rw] inline_rules
    #  The inline doctype rules.
    #  @return [String]
    #
    class Doctype
      attr_accessor :name, :type, :public_id, :system_id, :inline_rules

      ##
      # @example
      #  dtd = Doctype.new(:name => 'html', :type => 'PUBLIC')
      #
      # @param [Hash] options
      #
      # @option options [String] :name
      # @option options [String] :type
      # @option options [String] :public_id
      # @option options [String] :system_id
      # @option options [String] :inline_rules
      #
      def initialize(options = {})
        @name, @type = options[:name], options[:type]

        @public_id, @system_id = options[:public_id], options[:system_id]

        @inline_rules = options[:inline_rules]
      end

      ##
      # Serializes the doctype. Only the name is always written; the type, the
      # quoted public/system IDs and the bracketed inline rules are appended
      # when they are set.
      #
      # @return [String]
      #
      def to_xml
        parts = ["<!DOCTYPE #{name}"]

        parts << " #{type}" if type
        parts << %Q{ "#{public_id}"} if public_id
        parts << %Q{ "#{system_id}"} if system_id
        parts << " [#{inline_rules}]" if inline_rules
        parts << '>'

        parts.join
      end

      ##
      # Describes the doctype, listing every field that is set and not empty.
      #
      # @return [String]
      #
      def inspect
        fields = [:name, :type, :public_id, :system_id, :inline_rules]

        segments = fields.map { |field| [field, send(field)] }
          .reject { |_, current| !current || current.empty? }
          .map { |field, current| "#{field}: #{current.inspect}" }

        "Doctype(#{segments.join(' ')})"
      end

      ##
      # The type of this node, always `:doctype`.
      #
      # @return [Symbol]
      #
      def node_type
        :doctype
      end
    end # Doctype
  end # XML
end # Oga
@@ -0,0 +1,99 @@
1
module Oga
  module XML
    ##
    # Container for a whole XML document: its doctype, its XML declaration and
    # its child nodes.
    #
    # @!attribute [rw] doctype
    #  The doctype of the document.
    #  @return [Oga::XML::Doctype]
    #
    # @!attribute [rw] xml_declaration
    #  The XML declaration of the document.
    #  @return [Oga::XML::XmlDeclaration]
    #
    class Document
      include Querying
      include Traversal

      attr_accessor :doctype, :xml_declaration

      ##
      # @param [Hash] options
      #
      # @option options [Oga::XML::NodeSet] :children
      # @option options [Oga::XML::Doctype] :doctype
      # @option options [Oga::XML::XmlDeclaration] :xml_declaration
      #
      def initialize(options = {})
        @doctype         = options[:doctype]
        @xml_declaration = options[:xml_declaration]

        nodes = options[:children]

        self.children = nodes if nodes
      end

      ##
      # The child nodes of the document. An empty set owned by this document
      # is created the first time this is called without children being set.
      #
      # @return [Oga::XML::NodeSet]
      #
      def children
        @children ||= NodeSet.new([], self)
      end

      ##
      # Sets the child nodes of the document. A NodeSet is stored as is,
      # anything else is wrapped in a new NodeSet built with this document.
      #
      # @param [Oga::XML::NodeSet|Array] nodes
      #
      def children=(nodes)
        @children = nodes.is_a?(NodeSet) ? nodes : NodeSet.new(nodes, self)
      end

      ##
      # Serializes the child nodes, preceded by the doctype and the XML
      # declaration (each on its own line) when these are set. The body is
      # stripped of surrounding whitespace whenever something is prepended.
      #
      # @return [String]
      #
      def to_xml
        output = children.map(&:to_xml).join('')

        output = "#{doctype.to_xml}\n#{output.strip}" if doctype

        if xml_declaration
          output = "#{xml_declaration.to_xml}\n#{output.strip}"
        end

        output
      end

      ##
      # Describes the document, listing the doctype, XML declaration and
      # children that are set, one per line.
      #
      # @return [String]
      #
      def inspect
        fields = [:doctype, :xml_declaration, :children]

        segments = fields.each_with_object([]) do |field, memo|
          current = send(field)

          memo << "#{field}: #{current.inspect}" if current
        end

        "Document(\n#{segments.join("\n ")}\n)"
      end
    end # Document
  end # XML
end # Oga