rubyjedi-oga 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +13 -0
- data/LICENSE +362 -0
- data/README.md +317 -0
- data/doc/css/common.css +77 -0
- data/doc/css_selectors.md +935 -0
- data/doc/manually_creating_documents.md +67 -0
- data/doc/migrating_from_nokogiri.md +169 -0
- data/doc/xml_namespaces.md +63 -0
- data/ext/c/extconf.rb +11 -0
- data/ext/c/lexer.c +2595 -0
- data/ext/c/lexer.h +16 -0
- data/ext/c/lexer.rl +198 -0
- data/ext/c/liboga.c +6 -0
- data/ext/c/liboga.h +11 -0
- data/ext/java/Liboga.java +14 -0
- data/ext/java/org/liboga/xml/Lexer.java +1363 -0
- data/ext/java/org/liboga/xml/Lexer.rl +223 -0
- data/ext/ragel/base_lexer.rl +633 -0
- data/lib/oga.rb +57 -0
- data/lib/oga/blacklist.rb +40 -0
- data/lib/oga/css/lexer.rb +743 -0
- data/lib/oga/css/parser.rb +976 -0
- data/lib/oga/entity_decoder.rb +21 -0
- data/lib/oga/html/entities.rb +2150 -0
- data/lib/oga/html/parser.rb +25 -0
- data/lib/oga/html/sax_parser.rb +18 -0
- data/lib/oga/lru.rb +160 -0
- data/lib/oga/oga.rb +57 -0
- data/lib/oga/version.rb +3 -0
- data/lib/oga/whitelist.rb +20 -0
- data/lib/oga/xml/attribute.rb +136 -0
- data/lib/oga/xml/cdata.rb +17 -0
- data/lib/oga/xml/character_node.rb +37 -0
- data/lib/oga/xml/comment.rb +17 -0
- data/lib/oga/xml/default_namespace.rb +13 -0
- data/lib/oga/xml/doctype.rb +82 -0
- data/lib/oga/xml/document.rb +108 -0
- data/lib/oga/xml/element.rb +428 -0
- data/lib/oga/xml/entities.rb +122 -0
- data/lib/oga/xml/html_void_elements.rb +15 -0
- data/lib/oga/xml/lexer.rb +550 -0
- data/lib/oga/xml/namespace.rb +48 -0
- data/lib/oga/xml/node.rb +219 -0
- data/lib/oga/xml/node_set.rb +333 -0
- data/lib/oga/xml/parser.rb +631 -0
- data/lib/oga/xml/processing_instruction.rb +37 -0
- data/lib/oga/xml/pull_parser.rb +175 -0
- data/lib/oga/xml/querying.rb +56 -0
- data/lib/oga/xml/sax_parser.rb +192 -0
- data/lib/oga/xml/text.rb +66 -0
- data/lib/oga/xml/traversal.rb +50 -0
- data/lib/oga/xml/xml_declaration.rb +65 -0
- data/lib/oga/xpath/evaluator.rb +1798 -0
- data/lib/oga/xpath/lexer.rb +1958 -0
- data/lib/oga/xpath/parser.rb +622 -0
- data/oga.gemspec +45 -0
- metadata +227 -0
@@ -0,0 +1,25 @@
|
|
1
|
+
module Oga
  module HTML
    ##
    # HTML flavoured parser. It subclasses {Oga::XML::Parser} and only differs
    # from it by forcing the `:html` option on before handing the input over
    # to the parent class.
    #
    # Usage:
    #
    #     Oga::HTML::Parser.new('<meta charset="utf-8">').parse
    #
    class Parser < XML::Parser
      ##
      # @param [String|IO] data The HTML input to parse.
      # @param [Hash] options Options forwarded to the parent parser. `:html`
      #  is always set to `true`; the Hash passed in is merged into a new one
      #  rather than modified.
      # @see [Oga::XML::Parser#initialize]
      #
      def initialize(data, options = {})
        html_options = options.merge(:html => true)

        super(data, html_options)
      end
    end # Parser
  end # HTML
end # Oga
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Oga
  module HTML
    ##
    # SAX style parser for HTML input. It subclasses {Oga::XML::SaxParser}
    # (see that class for details on handlers) and only forces the `:html`
    # option on.
    #
    class SaxParser < XML::SaxParser
      ##
      # @param [Object] handler The SAX handler passed on to the parent class.
      # @param [String|IO] data The HTML input to parse.
      # @param [Hash] options Options forwarded to the parent parser. `:html`
      #  is always set to `true`; the Hash passed in is merged into a new one
      #  rather than modified.
      # @see [Oga::XML::SaxParser#initialize]
      #
      def initialize(handler, data, options = {})
        html_options = options.merge(:html => true)

        super(handler, data, html_options)
      end
    end # SaxParser
  end # HTML
end # Oga
|
data/lib/oga/lru.rb
ADDED
@@ -0,0 +1,160 @@
|
|
1
|
+
module Oga
  ##
  # Thread-safe, size-bounded cache using a Hash as the underlying storage
  # engine. Whenever the size of the cache exceeds the given limit the oldest
  # keys are removed (based on insert order). Reading a key does not change
  # its position in that order, writing it moves it to the end.
  #
  # This class uses its own list of keys (as returned by {LRU#keys}) instead of
  # relying on `Hash#keys` as the latter allocates a new Array upon every call.
  #
  # This class doesn't use MonitorMixin due to the extra overhead it adds
  # compared to using a Mutex directly.
  #
  # Example usage:
  #
  #     cache = LRU.new(3)
  #
  #     cache[:a] = 10
  #     cache[:b] = 20
  #     cache[:c] = 30
  #     cache[:d] = 40
  #
  #     cache.keys # => [:b, :c, :d]
  #
  # @api private
  #
  class LRU
    ##
    # @param [Fixnum] maximum The maximum number of keys to keep.
    #
    def initialize(maximum = 1024)
      @maximum = maximum
      @cache   = {}
      @keys    = []
      @mutex   = Mutex.new

      # The thread currently holding @mutex, or nil when nobody does. It must
      # start out empty: seeding it with the creating thread would let that
      # thread skip the lock entirely.
      @owner = nil
    end

    ##
    # Changes the maximum size and immediately evicts the oldest keys if the
    # cache is larger than the new limit.
    #
    # @param [Fixnum] value
    #
    def maximum=(value)
      synchronize do
        @maximum = value

        resize
      end
    end

    ##
    # @return [Fixnum]
    #
    def maximum
      synchronize { @maximum }
    end

    ##
    # Returns the value of the key, or nil if the key is not cached.
    #
    # @param [Mixed] key
    # @return [Mixed]
    #
    def [](key)
      synchronize { @cache[key] }
    end

    ##
    # Sets the key and its value. Old keys are discarded if the LRU size exceeds
    # the limit.
    #
    # @param [Mixed] key
    # @param [Mixed] value
    #
    def []=(key, value)
      synchronize do
        @cache[key] = value

        # Array#delete is a no-op for unknown keys, so a separate include?
        # scan is not needed. Re-appending moves the key to the "newest" end.
        @keys.delete(key)
        @keys << key

        resize
      end
    end

    ##
    # Returns the value of a key if it is set to a truthy value, otherwise
    # yields the supplied block and stores its return value under the key.
    # The block runs while the lock is held.
    #
    # @param [Mixed] key
    # @return [Mixed]
    #
    def get_or_set(key)
      synchronize { self[key] ||= yield }
    end

    ##
    # Returns the internal list of keys, oldest first. No copy is made, so the
    # returned Array should be treated as read-only.
    #
    # @return [Array]
    #
    def keys
      synchronize { @keys }
    end

    ##
    # @param [Mixed] key
    # @return [TrueClass|FalseClass]
    #
    def key?(key)
      synchronize { @cache.key?(key) }
    end

    ##
    # Removes all keys from the cache.
    #
    def clear
      synchronize do
        @keys.clear
        @cache.clear
      end
    end

    ##
    # @return [Fixnum]
    #
    def size
      synchronize { @cache.size }
    end

    alias_method :length, :size

    private

    ##
    # Yields the supplied block while holding the mutex. Calls made by the
    # thread that already holds the mutex are yielded directly so that
    # synchronized methods can call each other. This method is heavily based
    # on `MonitorMixin#mon_enter`.
    #
    def synchronize
      # Only the thread that holds the mutex can ever see itself as the owner,
      # because ownership is cleared again before the mutex is released.
      return yield if @owner == Thread.current

      @mutex.synchronize do
        @owner = Thread.current

        begin
          yield
        ensure
          # Give up ownership even when the block raises. A stale owner would
          # allow that thread to bypass the mutex on its next call.
          @owner = nil
        end
      end
    end

    ##
    # Removes old keys until the size of the hash no longer exceeds the maximum
    # size. Must only be called from within {#synchronize}.
    #
    def resize
      excess = @cache.size - @maximum

      return unless excess > 0

      @keys.shift(excess).each { |key| @cache.delete(key) }
    end
  end # LRU
end # Oga
|
data/lib/oga/oga.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
module Oga
  ##
  # Builds an {Oga::XML::Parser} for the given XML input and returns the
  # result of parsing it.
  #
  # @example
  #  document = Oga.parse_xml('<root>Hello</root>')
  #
  # @see [Oga::XML::Lexer#initialize]
  #
  # @return [Oga::XML::Document]
  #
  def self.parse_xml(xml, options = {})
    parser = XML::Parser.new(xml, options)

    parser.parse
  end

  ##
  # Builds an {Oga::HTML::Parser} for the given HTML input and returns the
  # result of parsing it.
  #
  # @example
  #  document = Oga.parse_html('<html>...</html>')
  #
  # @see [Oga::XML::Lexer#initialize]
  #
  # @return [Oga::XML::Document]
  #
  def self.parse_html(html, options = {})
    parser = HTML::Parser.new(html, options)

    parser.parse
  end

  ##
  # Runs the XML SAX parser over the given input using the supplied handler.
  #
  # @example
  #  handler = SomeSaxHandler.new
  #
  #  Oga.sax_parse_xml(handler, '<root>Hello</root>')
  #
  # @see [Oga::XML::SaxParser#initialize]
  #
  def self.sax_parse_xml(handler, xml, options = {})
    parser = XML::SaxParser.new(handler, xml, options)

    parser.parse
  end

  ##
  # Runs the HTML SAX parser over the given input using the supplied handler.
  #
  # @example
  #  handler = SomeSaxHandler.new
  #
  #  Oga.sax_parse_html(handler, '<script>foo()</script>')
  #
  # @see [Oga::XML::SaxParser#initialize]
  #
  def self.sax_parse_html(handler, html, options = {})
    parser = HTML::SaxParser.new(handler, html, options)

    parser.parse
  end
end # Oga
|
data/lib/oga/version.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
module Oga
  ##
  # List of names in which only the listed names are allowed. Builds on
  # {Oga::Blacklist}, which is expected to provide `names` (not visible from
  # this file).
  #
  # @api private
  #
  class Whitelist < Blacklist
    ##
    # Returns true when the given name is part of the list.
    #
    # @param [Mixed] name
    # @return [TrueClass|FalseClass]
    #
    def allow?(name)
      listed = names.include?(name)

      listed
    end

    ##
    # Returns a blacklist built from the same names as this whitelist.
    #
    # @return [Oga::Blacklist]
    #
    def to_blacklist
      Blacklist.new(names)
    end
  end # Whitelist
end # Oga
|
@@ -0,0 +1,136 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Class for storing information about a single XML attribute.
    #
    class Attribute
      # The name of the attribute.
      # @return [String]
      attr_accessor :name

      # The namespace prefix of the attribute, if any.
      # @return [String]
      attr_accessor :namespace_name

      # The element this attribute belongs to.
      # @return [Oga::XML::Element]
      attr_accessor :element

      ##
      # The default namespace available to all attributes. This namespace can
      # not be modified.
      #
      # @return [Oga::XML::Namespace]
      #
      DEFAULT_NAMESPACE = Namespace.new(
        :name => 'xml',
        :uri  => XML::DEFAULT_NAMESPACE.uri
      ).freeze

      ##
      # @param [Hash] options
      #
      # @option options [String] :name
      # @option options [String] :namespace_name
      # @option options [String] :value
      # @option options [Oga::XML::Element] :element
      #
      def initialize(options = {})
        @name    = options[:name]
        @value   = options[:value]
        @element = options[:element]

        @namespace_name = options[:namespace_name]
      end

      ##
      # Returns the {Oga::XML::Namespace} instance for the current namespace
      # name. The result is memoized once a namespace has been found.
      #
      # Returns nil when the namespace can not be resolved, which includes
      # attributes that are not (yet) attached to an element.
      #
      # @return [Oga::XML::Namespace|NilClass]
      #
      def namespace
        return @namespace if @namespace

        if namespace_name == DEFAULT_NAMESPACE.name
          @namespace = DEFAULT_NAMESPACE
        elsif element
          # Without an element there is nothing to look the prefix up in, so
          # detached attributes simply have no namespace instead of raising.
          @namespace = element.available_namespaces[namespace_name]
        end

        @namespace
      end

      ##
      # Sets the raw value. The value is marked as not yet decoded, so the
      # next call to {#value} runs it through the entity decoder.
      #
      # @param [String] value
      #
      def value=(value)
        @value   = value
        @decoded = false
      end

      ##
      # Returns the value of the attribute or nil if no explicit value was set.
      #
      # Entities are decoded lazily on the first read, after which the decoded
      # String replaces the stored value.
      #
      # @return [String|NilClass]
      #
      def value
        if !@decoded && @value
          @value   = EntityDecoder.try_decode(@value, html?)
          @decoded = true
        end

        @value
      end

      ##
      # Returns the value as a String, using an empty String when there is no
      # value.
      #
      # @return [String]
      #
      def text
        value.to_s
      end

      alias_method :to_s, :text

      ##
      # Serializes the attribute as `name="value"`, prefixing the name with the
      # namespace name when one is set.
      #
      # @return [String]
      #
      def to_xml
        if namespace_name
          full_name = "#{namespace_name}:#{name}"
        else
          full_name = name
        end

        enc_value = value ? Entities.encode_attribute(value) : nil

        %Q(#{full_name}="#{enc_value}")
      end

      ##
      # @return [String]
      #
      def inspect
        segments = []

        [:name, :namespace, :value].each do |attr|
          attr_value = send(attr)

          if attr_value
            segments << "#{attr}: #{attr_value.inspect}"
          end
        end

        "Attribute(#{segments.join(' ')})"
      end

      private

      ##
      # Returns true when the attribute belongs to an element for which
      # `html?` is truthy.
      #
      # @return [TrueClass|FalseClass]
      #
      def html?
        !!@element && @element.html?
      end
    end # Attribute
  end # XML
end # Oga
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Node type representing a CDATA section.
    #
    class Cdata < CharacterNode
      ##
      # Serializes the node by wrapping its text in CDATA delimiters.
      #
      # @return [String]
      #
      def to_xml
        body = text

        "<![CDATA[#{body}]]>"
      end
    end # Cdata
  end # XML
end # Oga
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Shared parent for nodes whose content is a single chunk of text, such
    # as Text and Comment nodes.
    #
    class CharacterNode < Node
      # The text stored in the node.
      # @return [String]
      attr_accessor :text

      ##
      # @param [Hash] options Passed on to the parent class as-is.
      #
      # @option options [String] :text The text of the node.
      #
      def initialize(options = {})
        super

        @text = options[:text]
      end

      ##
      # Returns the text as a String, using an empty String when no text is
      # set.
      #
      # @return [String]
      #
      def to_xml
        text.to_s
      end

      ##
      # Returns the class name without its namespace, followed by the
      # inspected text in parentheses.
      #
      # @return [String]
      #
      def inspect
        short_name = self.class.to_s.split('::').last

        "#{short_name}(#{text.inspect})"
      end
    end # CharacterNode
  end # XML
end # Oga
|