oga 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +13 -0
- data/LICENSE +19 -0
- data/README.md +171 -0
- data/doc/DCO.md +25 -0
- data/doc/changelog.md +7 -0
- data/doc/css/common.css +76 -0
- data/doc/migrating_from_nokogiri.md +169 -0
- data/ext/c/extconf.rb +13 -0
- data/ext/c/lexer.c +1518 -0
- data/ext/c/lexer.h +8 -0
- data/ext/c/lexer.rl +121 -0
- data/ext/c/liboga.c +6 -0
- data/ext/c/liboga.h +11 -0
- data/ext/java/Liboga.java +14 -0
- data/ext/java/org/liboga/xml/Lexer.java +829 -0
- data/ext/java/org/liboga/xml/Lexer.rl +151 -0
- data/ext/ragel/base_lexer.rl +323 -0
- data/lib/oga.rb +43 -0
- data/lib/oga/html/parser.rb +25 -0
- data/lib/oga/oga.rb +27 -0
- data/lib/oga/version.rb +3 -0
- data/lib/oga/xml/attribute.rb +111 -0
- data/lib/oga/xml/cdata.rb +24 -0
- data/lib/oga/xml/character_node.rb +39 -0
- data/lib/oga/xml/comment.rb +24 -0
- data/lib/oga/xml/doctype.rb +91 -0
- data/lib/oga/xml/document.rb +99 -0
- data/lib/oga/xml/element.rb +340 -0
- data/lib/oga/xml/lexer.rb +399 -0
- data/lib/oga/xml/namespace.rb +42 -0
- data/lib/oga/xml/node.rb +175 -0
- data/lib/oga/xml/node_set.rb +313 -0
- data/lib/oga/xml/parser.rb +556 -0
- data/lib/oga/xml/processing_instruction.rb +39 -0
- data/lib/oga/xml/pull_parser.rb +166 -0
- data/lib/oga/xml/querying.rb +32 -0
- data/lib/oga/xml/text.rb +16 -0
- data/lib/oga/xml/traversal.rb +48 -0
- data/lib/oga/xml/xml_declaration.rb +76 -0
- data/lib/oga/xpath/evaluator.rb +1748 -0
- data/lib/oga/xpath/lexer.rb +2043 -0
- data/lib/oga/xpath/node.rb +10 -0
- data/lib/oga/xpath/parser.rb +535 -0
- data/oga.gemspec +45 -0
- metadata +221 -0
data/lib/oga.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'ast'
|
2
|
+
require 'set'
|
3
|
+
require 'stringio'
|
4
|
+
|
5
|
+
require_relative 'oga/version'
|
6
|
+
require_relative 'oga/oga'
|
7
|
+
|
8
|
+
# Load these first so that the native extensions don't have to define the
|
9
|
+
# Oga::XML namespace.
|
10
|
+
require_relative 'oga/xml/lexer'
|
11
|
+
require_relative 'oga/xml/parser'
|
12
|
+
require_relative 'oga/xml/pull_parser'
|
13
|
+
|
14
|
+
require_relative 'liboga'
|
15
|
+
|
16
|
+
#:nocov:
|
17
|
+
# On JRuby, load the Java extension (org.liboga.Liboga) into the running
# JRuby runtime. On every other platform this line is a no-op.
org.liboga.Liboga.load(JRuby.runtime) if RUBY_PLATFORM == 'java'
|
20
|
+
#:nocov:
|
21
|
+
|
22
|
+
require_relative 'oga/xml/querying'
|
23
|
+
require_relative 'oga/xml/traversal'
|
24
|
+
require_relative 'oga/xml/node'
|
25
|
+
require_relative 'oga/xml/document'
|
26
|
+
require_relative 'oga/xml/character_node'
|
27
|
+
require_relative 'oga/xml/text'
|
28
|
+
require_relative 'oga/xml/comment'
|
29
|
+
require_relative 'oga/xml/cdata'
|
30
|
+
require_relative 'oga/xml/xml_declaration'
|
31
|
+
require_relative 'oga/xml/processing_instruction'
|
32
|
+
require_relative 'oga/xml/doctype'
|
33
|
+
require_relative 'oga/xml/namespace'
|
34
|
+
require_relative 'oga/xml/attribute'
|
35
|
+
require_relative 'oga/xml/element'
|
36
|
+
require_relative 'oga/xml/node_set'
|
37
|
+
|
38
|
+
require_relative 'oga/html/parser'
|
39
|
+
|
40
|
+
require_relative 'oga/xpath/node'
|
41
|
+
require_relative 'oga/xpath/lexer'
|
42
|
+
require_relative 'oga/xpath/parser'
|
43
|
+
require_relative 'oga/xpath/evaluator'
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Oga
  module HTML
    ##
    # HTML flavoured parser. It is a thin subclass of {Oga::XML::Parser} whose
    # only job is to switch on the `:html` option before handing the input to
    # the XML parser.
    #
    # Example:
    #
    #     Oga::HTML::Parser.new('<meta charset="utf-8">').parse
    #
    class Parser < XML::Parser
      ##
      # Builds a parser with the `:html` option forced to `true`. The Hash
      # passed by the caller is left untouched, a merged copy is used instead.
      #
      # @param [String|IO] data
      # @param [Hash] options
      # @see Oga::XML::Parser#initialize
      #
      def initialize(data, options = {})
        html_options = options.merge(:html => true)

        super(data, html_options)
      end
    end # Parser
  end # HTML
end # Oga
|
data/lib/oga/oga.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
module Oga
  ##
  # Runs the given input through {Oga::XML::Parser} and returns the result of
  # its `parse` call.
  #
  # @example
  #  document = Oga.parse_xml('<root>Hello</root>')
  #
  # @param [String|IO] xml The XML input to parse.
  # @return [Oga::XML::Document]
  #
  def self.parse_xml(xml)
    parser = XML::Parser.new(xml)

    parser.parse
  end

  ##
  # Runs the given input through {Oga::HTML::Parser} and returns the result of
  # its `parse` call.
  #
  # @example
  #  document = Oga.parse_html('<html>...</html>')
  #
  # @param [String|IO] html The HTML input to parse.
  # @return [Oga::XML::Document]
  #
  def self.parse_html(html)
    parser = HTML::Parser.new(html)

    parser.parse
  end
end # Oga
|
data/lib/oga/version.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Class for storing information about a single XML attribute.
    #
    # @!attribute [rw] name
    #  The name of the attribute.
    #  @return [String]
    #
    # @!attribute [rw] namespace_name
    #  The name of the namespace of the attribute. It is compared against the
    #  name of {DEFAULT_NAMESPACE} and otherwise used as the key for looking
    #  up the namespace in the namespaces available to {#element}.
    #  @return [String]
    #
    # @!attribute [rw] value
    #  The value of the attribute.
    #  @return [String]
    #
    # @!attribute [rw] element
    #  The element this attribute belongs to.
    #  @return [Oga::XML::Element]
    #
    class Attribute
      attr_accessor :name, :namespace_name, :element, :value

      ##
      # The default namespace available to all attributes. This namespace can
      # not be modified.
      #
      # @return [Oga::XML::Namespace]
      #
      DEFAULT_NAMESPACE = Namespace.new(
        :name => 'xml',
        :uri  => 'http://www.w3.org/XML/1998/namespace'
      ).freeze

      ##
      # @param [Hash] options
      #
      # @option options [String] :name
      # @option options [String] :namespace_name
      # @option options [String] :value
      # @option options [Oga::XML::Element] :element
      #
      def initialize(options = {})
        @name    = options[:name]
        @value   = options[:value]
        @element = options[:element]

        @namespace_name = options[:namespace_name]
      end

      ##
      # Returns the {Oga::XML::Namespace} instance for the current namespace
      # name. A successful lookup is memoized.
      #
      # Returns `nil` instead of raising when the attribute is not attached to
      # an element and the namespace name is not the default "xml" one.
      #
      # @return [Oga::XML::Namespace|NilClass]
      #
      def namespace
        @namespace ||=
          if namespace_name == DEFAULT_NAMESPACE.name
            DEFAULT_NAMESPACE
          elsif element
            element.available_namespaces[namespace_name]
          end
      end

      ##
      # Returns the value of the attribute.
      #
      # @return [String]
      #
      def text
        value.to_s
      end

      alias_method :to_s, :text

      ##
      # Serializes the attribute as `name="value"`, prefixed with the
      # namespace name when one is set.
      #
      # @return [String]
      #
      def to_xml
        return %Q(#{name}="#{value}") unless namespace_name

        # Prefer the name of the resolved namespace; fall back to the raw
        # namespace name when the namespace can not be resolved so that
        # serializing never raises on a nil namespace.
        resolved = namespace
        prefix   = resolved ? resolved.name : namespace_name

        %Q(#{prefix}:#{name}="#{value}")
      end

      ##
      # Lists the name, namespace and value of the attribute, skipping the
      # ones that are not set.
      #
      # @return [String]
      #
      def inspect
        segments = []

        [:name, :namespace, :value].each do |attr|
          # Named `current` so the block local does not shadow #value.
          current = send(attr)

          segments << "#{attr}: #{current.inspect}" if current
        end

        "Attribute(#{segments.join(' ')})"
      end
    end # Attribute
  end # XML
end # Oga
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Node class representing a CDATA section. The text itself is stored by
    # the {Oga::XML::CharacterNode} superclass.
    #
    class Cdata < CharacterNode
      ##
      # Wraps the text of the node in CDATA delimiters.
      #
      # @return [String]
      #
      def to_xml
        "<![CDATA[#{text}]]>"
      end

      ##
      # The type identifier of this kind of node, always `:cdata`.
      #
      # @return [Symbol]
      #
      def node_type
        :cdata
      end
    end # Cdata
  end # XML
end # Oga
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Shared superclass of the nodes whose content is a single run of text,
    # such as Text and Comment nodes.
    #
    # @!attribute [rw] text
    #  @return [String]
    #
    class CharacterNode < Node
      attr_accessor :text

      ##
      # Forwards the options to the superclass unchanged and then stores the
      # `:text` option.
      #
      # @param [Hash] options
      #
      # @option options [String] :text The text of the node.
      #
      def initialize(options = {})
        super

        @text = options[:text]
      end

      ##
      # The text of the node as a String (an empty String when no text is
      # set).
      #
      # @return [String]
      #
      def to_xml
        text.to_s
      end

      ##
      # Formats the node as `ClassName("text")`, using the class name without
      # its enclosing namespaces.
      #
      # @return [String]
      #
      def inspect
        short_name = self.class.to_s.split('::').last

        "#{short_name}(#{text.inspect})"
      end
    end # CharacterNode
  end # XML
end # Oga
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Node class representing an XML comment. The text itself is stored by the
    # {Oga::XML::CharacterNode} superclass.
    #
    class Comment < CharacterNode
      ##
      # Wraps the text of the node in comment delimiters.
      #
      # @return [String]
      #
      def to_xml
        "<!--#{text}-->"
      end

      ##
      # The type identifier of this kind of node, always `:comment`.
      #
      # @return [Symbol]
      #
      def node_type
        :comment
      end
    end # Comment
  end # XML
end # Oga
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Class used for storing information about Doctypes.
    #
    # @!attribute [rw] name
    #  The name of the doctype (e.g. "HTML").
    #  @return [String]
    #
    # @!attribute [rw] type
    #  The type of the doctype (e.g. "PUBLIC").
    #  @return [String]
    #
    # @!attribute [rw] public_id
    #  The public ID of the doctype.
    #  @return [String]
    #
    # @!attribute [rw] system_id
    #  The system ID of the doctype.
    #  @return [String]
    #
    # @!attribute [rw] inline_rules
    #  The inline doctype rules.
    #  @return [String]
    #
    class Doctype
      attr_accessor :name, :type, :public_id, :system_id, :inline_rules

      ##
      # @example
      #  dtd = Doctype.new(:name => 'html', :type => 'PUBLIC')
      #
      # @param [Hash] options
      #
      # @option options [String] :name
      # @option options [String] :type
      # @option options [String] :public_id
      # @option options [String] :system_id
      # @option options [String] :inline_rules
      #
      def initialize(options = {})
        @name         = options[:name]
        @type         = options[:type]
        @public_id    = options[:public_id]
        @system_id    = options[:system_id]
        @inline_rules = options[:inline_rules]
      end

      ##
      # Converts the doctype back to XML. The type, public ID, system ID and
      # inline rules are only included when they are set.
      #
      # @return [String]
      #
      def to_xml
        xml = "<!DOCTYPE #{name}"

        xml << " #{type}" if type
        xml << %Q{ "#{public_id}"} if public_id
        xml << %Q{ "#{system_id}"} if system_id
        xml << " [#{inline_rules}]" if inline_rules

        xml << '>'
      end

      ##
      # Inspects the doctype. Attributes that are not set or that are empty
      # are left out.
      #
      # @return [String]
      #
      def inspect
        segments = []

        [:name, :type, :public_id, :system_id, :inline_rules].each do |attr|
          value = send(attr)

          segments << "#{attr}: #{value.inspect}" if value && !value.empty?
        end

        "Doctype(#{segments.join(' ')})"
      end

      ##
      # @return [Symbol]
      #
      def node_type
        :doctype
      end
    end # Doctype
  end # XML
end # Oga
|
@@ -0,0 +1,99 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Container for a complete XML document: its doctype, its XML declaration
    # and its child nodes.
    #
    # @!attribute [rw] doctype
    #  The doctype of the document.
    #  @return [Oga::XML::Doctype]
    #
    # @!attribute [rw] xml_declaration
    #  The XML declaration of the document.
    #  @return [Oga::XML::XmlDeclaration]
    #
    class Document
      include Querying
      include Traversal

      attr_accessor :doctype, :xml_declaration

      ##
      # @param [Hash] options
      #
      # @option options [Oga::XML::NodeSet] :children
      # @option options [Oga::XML::Doctype] :doctype
      # @option options [Oga::XML::XmlDeclaration] :xml_declaration
      #
      def initialize(options = {})
        @doctype, @xml_declaration =
          options.values_at(:doctype, :xml_declaration)

        nodes = options[:children]

        self.children = nodes if nodes
      end

      ##
      # The child nodes of the document. An empty node set owned by this
      # document is created on first use.
      #
      # @return [Oga::XML::NodeSet]
      #
      def children
        @children ||= NodeSet.new([], self)
      end

      ##
      # Replaces the child nodes of the document. A NodeSet is stored as-is,
      # anything else is wrapped in a new NodeSet owned by this document.
      #
      # @param [Oga::XML::NodeSet|Array] nodes
      #
      def children=(nodes)
        @children = nodes.is_a?(NodeSet) ? nodes : NodeSet.new(nodes, self)
      end

      ##
      # Serializes the child nodes and puts the doctype and the XML
      # declaration (when present) in front of them, each on its own line.
      #
      # @return [String]
      #
      def to_xml
        # The doctype is prepended first so that the XML declaration, which
        # is prepended last, ends up at the very top of the output.
        headers = [doctype, xml_declaration].select { |header| header }
        body    = children.map(&:to_xml).join('')

        headers.inject(body) do |xml, header|
          header.to_xml + "\n" + xml.strip
        end
      end

      ##
      # Inspects the document, listing the doctype, XML declaration and child
      # nodes that are set, one per line.
      #
      # @return [String]
      #
      def inspect
        described = [:doctype, :xml_declaration, :children].map do |attr|
          value = send(attr)

          "#{attr}: #{value.inspect}" if value
        end

        segments = described.compact

        # NOTE(review): the heredoc body is whitespace sensitive and is kept
        # exactly as found; confirm its indentation against the canonical file.
        <<-EOF.strip
Document(
#{segments.join("\n ")}
)
        EOF
      end
    end # Document
  end # XML
end # Oga
|