oga 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +13 -0
- data/LICENSE +19 -0
- data/README.md +171 -0
- data/doc/DCO.md +25 -0
- data/doc/changelog.md +7 -0
- data/doc/css/common.css +76 -0
- data/doc/migrating_from_nokogiri.md +169 -0
- data/ext/c/extconf.rb +13 -0
- data/ext/c/lexer.c +1518 -0
- data/ext/c/lexer.h +8 -0
- data/ext/c/lexer.rl +121 -0
- data/ext/c/liboga.c +6 -0
- data/ext/c/liboga.h +11 -0
- data/ext/java/Liboga.java +14 -0
- data/ext/java/org/liboga/xml/Lexer.java +829 -0
- data/ext/java/org/liboga/xml/Lexer.rl +151 -0
- data/ext/ragel/base_lexer.rl +323 -0
- data/lib/oga.rb +43 -0
- data/lib/oga/html/parser.rb +25 -0
- data/lib/oga/oga.rb +27 -0
- data/lib/oga/version.rb +3 -0
- data/lib/oga/xml/attribute.rb +111 -0
- data/lib/oga/xml/cdata.rb +24 -0
- data/lib/oga/xml/character_node.rb +39 -0
- data/lib/oga/xml/comment.rb +24 -0
- data/lib/oga/xml/doctype.rb +91 -0
- data/lib/oga/xml/document.rb +99 -0
- data/lib/oga/xml/element.rb +340 -0
- data/lib/oga/xml/lexer.rb +399 -0
- data/lib/oga/xml/namespace.rb +42 -0
- data/lib/oga/xml/node.rb +175 -0
- data/lib/oga/xml/node_set.rb +313 -0
- data/lib/oga/xml/parser.rb +556 -0
- data/lib/oga/xml/processing_instruction.rb +39 -0
- data/lib/oga/xml/pull_parser.rb +166 -0
- data/lib/oga/xml/querying.rb +32 -0
- data/lib/oga/xml/text.rb +16 -0
- data/lib/oga/xml/traversal.rb +48 -0
- data/lib/oga/xml/xml_declaration.rb +76 -0
- data/lib/oga/xpath/evaluator.rb +1748 -0
- data/lib/oga/xpath/lexer.rb +2043 -0
- data/lib/oga/xpath/node.rb +10 -0
- data/lib/oga/xpath/parser.rb +535 -0
- data/oga.gemspec +45 -0
- metadata +221 -0
data/lib/oga.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'ast'
|
2
|
+
require 'set'
|
3
|
+
require 'stringio'
|
4
|
+
|
5
|
+
require_relative 'oga/version'
|
6
|
+
require_relative 'oga/oga'
|
7
|
+
|
8
|
+
# Load these first so that the native extensions don't have to define the
|
9
|
+
# Oga::XML namespace.
|
10
|
+
require_relative 'oga/xml/lexer'
|
11
|
+
require_relative 'oga/xml/parser'
|
12
|
+
require_relative 'oga/xml/pull_parser'
|
13
|
+
|
14
|
+
require_relative 'liboga'
|
15
|
+
|
16
|
+
#:nocov:
|
17
|
+
if RUBY_PLATFORM == 'java'
|
18
|
+
org.liboga.Liboga.load(JRuby.runtime)
|
19
|
+
end
|
20
|
+
#:nocov:
|
21
|
+
|
22
|
+
require_relative 'oga/xml/querying'
|
23
|
+
require_relative 'oga/xml/traversal'
|
24
|
+
require_relative 'oga/xml/node'
|
25
|
+
require_relative 'oga/xml/document'
|
26
|
+
require_relative 'oga/xml/character_node'
|
27
|
+
require_relative 'oga/xml/text'
|
28
|
+
require_relative 'oga/xml/comment'
|
29
|
+
require_relative 'oga/xml/cdata'
|
30
|
+
require_relative 'oga/xml/xml_declaration'
|
31
|
+
require_relative 'oga/xml/processing_instruction'
|
32
|
+
require_relative 'oga/xml/doctype'
|
33
|
+
require_relative 'oga/xml/namespace'
|
34
|
+
require_relative 'oga/xml/attribute'
|
35
|
+
require_relative 'oga/xml/element'
|
36
|
+
require_relative 'oga/xml/node_set'
|
37
|
+
|
38
|
+
require_relative 'oga/html/parser'
|
39
|
+
|
40
|
+
require_relative 'oga/xpath/node'
|
41
|
+
require_relative 'oga/xpath/lexer'
|
42
|
+
require_relative 'oga/xpath/parser'
|
43
|
+
require_relative 'oga/xpath/evaluator'
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Oga
  module HTML
    ##
    # HTML flavour of {Oga::XML::Parser}. The only thing this subclass adds is
    # switching the `:html` option on before handing everything to the XML
    # parser.
    #
    # Usage:
    #
    #     Oga::HTML::Parser.new('<meta charset="utf-8">').parse
    #
    class Parser < XML::Parser
      ##
      # @param [String|IO] data
      # @param [Hash] options Options passed on to the XML parser. `:html` is
      #  always set to `true`, replacing any value supplied by the caller. The
      #  Hash that was passed in is not modified.
      # @see Oga::XML::Parser#initialize
      #
      def initialize(data, options = {})
        html_options = options.merge(:html => true)

        super(data, html_options)
      end
    end # Parser
  end # HTML
end # Oga
|
data/lib/oga/oga.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
module Oga
  ##
  # Parses the given XML document.
  #
  # @example
  #  document = Oga.parse_xml('<root>Hello</root>')
  #
  # @param [String|IO] xml The XML input to parse.
  # @param [Hash] options Optional settings, passed as-is to
  #  `Oga::XML::Parser.new`. Defaults to an empty Hash.
  # @return [Oga::XML::Document]
  #
  def self.parse_xml(xml, options = {})
    return XML::Parser.new(xml, options).parse
  end

  ##
  # Parses the given HTML document.
  #
  # @example
  #  document = Oga.parse_html('<html>...</html>')
  #
  # @param [String|IO] html The HTML input to parse.
  # @param [Hash] options Optional settings, passed as-is to
  #  `Oga::HTML::Parser.new`. Defaults to an empty Hash.
  # @return [Oga::XML::Document]
  #
  def self.parse_html(html, options = {})
    return HTML::Parser.new(html, options).parse
  end
end # Oga
|
data/lib/oga/version.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Class for storing information about a single XML attribute.
    #
    # @!attribute [rw] name
    #  The name of the attribute.
    #  @return [String]
    #
    # @!attribute [rw] namespace_name
    #  The name of the namespace of the attribute. This name is used to look up
    #  the matching {Oga::XML::Namespace}, see {#namespace}.
    #  @return [String]
    #
    # @!attribute [rw] value
    #  The value of the attribute.
    #  @return [String]
    #
    # @!attribute [rw] element
    #  The element this attribute belongs to.
    #  @return [Oga::XML::Element]
    #
    class Attribute
      attr_accessor :name, :namespace_name, :element, :value

      ##
      # The default namespace available to all attributes. This namespace can
      # not be modified.
      #
      # @return [Oga::XML::Namespace]
      #
      DEFAULT_NAMESPACE = Namespace.new(
        :name => 'xml',
        :uri  => 'http://www.w3.org/XML/1998/namespace'
      ).freeze

      ##
      # @param [Hash] options
      #
      # @option options [String] :name
      # @option options [String] :namespace_name
      # @option options [String] :value
      # @option options [Oga::XML::Element] :element
      #
      def initialize(options = {})
        @name    = options[:name]
        @value   = options[:value]
        @element = options[:element]

        @namespace_name = options[:namespace_name]
      end

      ##
      # Returns the {Oga::XML::Namespace} instance for the current namespace
      # name. A successful lookup is cached; `nil` results are looked up again
      # on the next call.
      #
      # When the attribute is not attached to an element there is nothing to
      # look the namespace up in, so `nil` is returned instead of raising.
      #
      # @return [Oga::XML::Namespace|NilClass]
      #
      def namespace
        unless @namespace
          if namespace_name == DEFAULT_NAMESPACE.name
            @namespace = DEFAULT_NAMESPACE
          elsif element
            @namespace = element.available_namespaces[namespace_name]
          end
        end

        return @namespace
      end

      ##
      # Returns the value of the attribute.
      #
      # @return [String]
      #
      def text
        return value.to_s
      end

      alias_method :to_s, :text

      ##
      # Converts the attribute back to XML. The value is inserted as-is, no
      # escaping is applied.
      #
      # @return [String]
      #
      def to_xml
        if namespace_name
          resolved = namespace

          # Fall back to the raw namespace name when the namespace can not be
          # resolved (e.g. a detached attribute) so serialising never raises.
          prefix    = resolved ? resolved.name : namespace_name
          full_name = "#{prefix}:#{name}"
        else
          full_name = name
        end

        return %Q(#{full_name}="#{value}")
      end

      ##
      # @return [String]
      #
      def inspect
        segments = []

        [:name, :namespace, :value].each do |attr|
          # Named differently from the `value` accessor to avoid shadowing it.
          attr_value = send(attr)

          if attr_value
            segments << "#{attr}: #{attr_value.inspect}"
          end
        end

        return "Attribute(#{segments.join(' ')})"
      end
    end # Attribute
  end # XML
end # Oga
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Class used for storing information about CDATA tags.
    #
    class Cdata < CharacterNode
      ##
      # Converts the node back to XML.
      #
      # The sequence "]]>" terminates a CDATA section and thus can not appear
      # inside one. When the text contains it, the section is closed after
      # "]]" and a new one is opened before ">" so the output stays
      # well-formed while still carrying the same characters.
      #
      # @return [String]
      #
      def to_xml
        escaped = text.to_s.gsub(']]>', ']]]]><![CDATA[>')

        return "<![CDATA[#{escaped}]]>"
      end

      ##
      # @return [Symbol]
      #
      def node_type
        return :cdata
      end
    end # Cdata
  end # XML
end # Oga
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Shared parent of the nodes whose content is a piece of text, for example
    # Text and Comment nodes.
    #
    # @!attribute [rw] text
    #  @return [String]
    #
    class CharacterNode < Node
      attr_accessor :text

      ##
      # @param [Hash] options Also passed on to the parent class.
      #
      # @option options [String] :text The text of the node.
      #
      def initialize(options = {})
        super(options)

        @text = options[:text]
      end

      ##
      # Serialises the node, which for a plain character node is just its
      # text (an empty String when no text is set).
      #
      # @return [String]
      #
      def to_xml
        text.to_s
      end

      ##
      # Formats the node as `ClassName("text")`, using only the last segment
      # of the class name.
      #
      # @return [String]
      #
      def inspect
        short_name = self.class.to_s.split('::').last

        "#{short_name}(#{text.inspect})"
      end
    end # CharacterNode
  end # XML
end # Oga
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Node class representing an XML comment.
    #
    class Comment < CharacterNode
      ##
      # Serialises the comment by wrapping its text in `<!--` and `-->`. The
      # text itself is inserted unchanged.
      #
      # @return [String]
      #
      def to_xml
        body = text

        "<!--#{body}-->"
      end

      ##
      # @return [Symbol] Always `:comment`.
      #
      def node_type
        :comment
      end
    end # Comment
  end # XML
end # Oga
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Class used for storing information about Doctypes.
    #
    # @!attribute [rw] name
    #  The name of the doctype (e.g. "HTML").
    #  @return [String]
    #
    # @!attribute [rw] type
    #  The type of the doctype (e.g. "PUBLIC").
    #  @return [String]
    #
    # @!attribute [rw] public_id
    #  The public ID of the doctype.
    #  @return [String]
    #
    # @!attribute [rw] system_id
    #  The system ID of the doctype.
    #  @return [String]
    #
    # @!attribute [rw] inline_rules
    #  The inline doctype rules.
    #  @return [String]
    #
    class Doctype
      attr_accessor :name, :type, :public_id, :system_id, :inline_rules

      ##
      # @example
      #  dtd = Doctype.new(:name => 'html', :type => 'PUBLIC')
      #
      # @param [Hash] options
      #
      # @option options [String] :name
      # @option options [String] :type
      # @option options [String] :public_id
      # @option options [String] :system_id
      # @option options [String] :inline_rules
      #
      def initialize(options = {})
        @name         = options[:name]
        @type         = options[:type]
        @public_id    = options[:public_id]
        @system_id    = options[:system_id]
        @inline_rules = options[:inline_rules]
      end

      ##
      # Converts the doctype back to XML. Segments that are not set (`nil`)
      # are left out; the public and system IDs are wrapped in double quotes
      # and the inline rules in square brackets.
      #
      # @return [String]
      #
      def to_xml
        segments = "<!DOCTYPE #{name}"

        segments << " #{type}" if type
        segments << %Q{ "#{public_id}"} if public_id
        segments << %Q{ "#{system_id}"} if system_id
        segments << " [#{inline_rules}]" if inline_rules

        # The buffer is local to this call, so append in place instead of
        # allocating one more String.
        return segments << '>'
      end

      ##
      # Inspects the doctype. Only attributes that are set and not empty are
      # included.
      #
      # @return [String]
      #
      def inspect
        segments = []

        [:name, :type, :public_id, :system_id, :inline_rules].each do |attr|
          value = send(attr)

          if value && !value.empty?
            segments << "#{attr}: #{value.inspect}"
          end
        end

        return "Doctype(#{segments.join(' ')})"
      end

      ##
      # @return [Symbol]
      #
      def node_type
        return :doctype
      end
    end # Doctype
  end # XML
end # Oga
|
@@ -0,0 +1,99 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Container for a complete XML document: its doctype, its XML declaration
    # and the top-level child nodes.
    #
    # @!attribute [rw] doctype
    #  The doctype of the document.
    #  @return [Oga::XML::Doctype]
    #
    # @!attribute [rw] xml_declaration
    #  The XML declaration of the document.
    #  @return [Oga::XML::XmlDeclaration]
    #
    class Document
      include Querying
      include Traversal

      attr_accessor :doctype, :xml_declaration

      ##
      # @param [Hash] options
      #
      # @option options [Oga::XML::NodeSet] :children
      # @option options [Oga::XML::Doctype] :doctype
      # @option options [Oga::XML::XmlDeclaration] :xml_declaration
      #
      def initialize(options = {})
        @doctype         = options[:doctype]
        @xml_declaration = options[:xml_declaration]

        nodes = options[:children]

        # Goes through the writer so plain Arrays end up wrapped in a NodeSet.
        self.children = nodes if nodes
      end

      ##
      # Returns the child nodes, creating an empty set owned by this document
      # on first access when none were assigned.
      #
      # @return [Oga::XML::NodeSet]
      #
      def children
        @children ||= NodeSet.new([], self)
      end

      ##
      # Sets the child nodes of the document. A NodeSet is stored as given,
      # anything else is wrapped in a new NodeSet owned by this document.
      #
      # @param [Oga::XML::NodeSet|Array] nodes
      #
      def children=(nodes)
        @children = nodes.is_a?(NodeSet) ? nodes : NodeSet.new(nodes, self)
      end

      ##
      # Converts the document and its child nodes to XML. When present, the
      # doctype and then the XML declaration are put in front of the body,
      # each on its own line; the text that follows is stripped of surrounding
      # whitespace at each of those steps.
      #
      # @return [String]
      #
      def to_xml
        output = children.map { |child| child.to_xml }.join('')

        output = "#{doctype.to_xml}\n#{output.strip}" if doctype
        output = "#{xml_declaration.to_xml}\n#{output.strip}" if xml_declaration

        output
      end

      ##
      # Inspects the document: the doctype, XML declaration and child nodes
      # are listed (when set), one per line, inside `Document( ... )`.
      #
      # @return [String]
      #
      def inspect
        segments = [:doctype, :xml_declaration, :children].each_with_object([]) do |attr, memo|
          current = send(attr)

          memo << "#{attr}: #{current.inspect}" if current
        end

        # The heredoc body is part of the returned String, so its whitespace is
        # significant and is kept exactly as it was.
        <<-EOF.strip
Document(
#{segments.join("\n ")}
)
        EOF
      end
    end # Document
  end # XML
end # Oga
|