rubyjedi-oga 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +13 -0
- data/LICENSE +362 -0
- data/README.md +317 -0
- data/doc/css/common.css +77 -0
- data/doc/css_selectors.md +935 -0
- data/doc/manually_creating_documents.md +67 -0
- data/doc/migrating_from_nokogiri.md +169 -0
- data/doc/xml_namespaces.md +63 -0
- data/ext/c/extconf.rb +11 -0
- data/ext/c/lexer.c +2595 -0
- data/ext/c/lexer.h +16 -0
- data/ext/c/lexer.rl +198 -0
- data/ext/c/liboga.c +6 -0
- data/ext/c/liboga.h +11 -0
- data/ext/java/Liboga.java +14 -0
- data/ext/java/org/liboga/xml/Lexer.java +1363 -0
- data/ext/java/org/liboga/xml/Lexer.rl +223 -0
- data/ext/ragel/base_lexer.rl +633 -0
- data/lib/oga.rb +57 -0
- data/lib/oga/blacklist.rb +40 -0
- data/lib/oga/css/lexer.rb +743 -0
- data/lib/oga/css/parser.rb +976 -0
- data/lib/oga/entity_decoder.rb +21 -0
- data/lib/oga/html/entities.rb +2150 -0
- data/lib/oga/html/parser.rb +25 -0
- data/lib/oga/html/sax_parser.rb +18 -0
- data/lib/oga/lru.rb +160 -0
- data/lib/oga/oga.rb +57 -0
- data/lib/oga/version.rb +3 -0
- data/lib/oga/whitelist.rb +20 -0
- data/lib/oga/xml/attribute.rb +136 -0
- data/lib/oga/xml/cdata.rb +17 -0
- data/lib/oga/xml/character_node.rb +37 -0
- data/lib/oga/xml/comment.rb +17 -0
- data/lib/oga/xml/default_namespace.rb +13 -0
- data/lib/oga/xml/doctype.rb +82 -0
- data/lib/oga/xml/document.rb +108 -0
- data/lib/oga/xml/element.rb +428 -0
- data/lib/oga/xml/entities.rb +122 -0
- data/lib/oga/xml/html_void_elements.rb +15 -0
- data/lib/oga/xml/lexer.rb +550 -0
- data/lib/oga/xml/namespace.rb +48 -0
- data/lib/oga/xml/node.rb +219 -0
- data/lib/oga/xml/node_set.rb +333 -0
- data/lib/oga/xml/parser.rb +631 -0
- data/lib/oga/xml/processing_instruction.rb +37 -0
- data/lib/oga/xml/pull_parser.rb +175 -0
- data/lib/oga/xml/querying.rb +56 -0
- data/lib/oga/xml/sax_parser.rb +192 -0
- data/lib/oga/xml/text.rb +66 -0
- data/lib/oga/xml/traversal.rb +50 -0
- data/lib/oga/xml/xml_declaration.rb +65 -0
- data/lib/oga/xpath/evaluator.rb +1798 -0
- data/lib/oga/xpath/lexer.rb +1958 -0
- data/lib/oga/xpath/parser.rb +622 -0
- data/oga.gemspec +45 -0
- metadata +227 -0
@@ -0,0 +1,25 @@
|
|
1
|
+
module Oga
  module HTML
    ##
    # HTML flavoured variant of {Oga::XML::Parser}. The only thing this class
    # adds is forcing the `:html` option to `true` before handing everything
    # over to the XML parser.
    #
    # Usage:
    #
    #     Oga::HTML::Parser.new('<meta charset="utf-8">').parse
    #
    class Parser < XML::Parser
      ##
      # Builds the parser with `:html => true` merged into a copy of the
      # supplied options; the caller's Hash is left untouched.
      #
      # @param [String|IO] data
      # @param [Hash] options
      # @see [Oga::XML::Parser#initialize]
      #
      def initialize(data, options = {})
        super(data, options.merge(:html => true))
      end
    end # Parser
  end # HTML
end # Oga
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Oga
  module HTML
    ##
    # SAX parser for HTML input. Behaves like {Oga::XML::SaxParser} (see its
    # documentation for details) with the `:html` option always enabled.
    #
    class SaxParser < XML::SaxParser
      ##
      # Forwards everything to the XML SAX parser after merging
      # `:html => true` into a copy of the supplied options.
      #
      # @see [Oga::XML::SaxParser#initialize]
      #
      def initialize(handler, data, options = {})
        super(handler, data, options.merge(:html => true))
      end
    end # SaxParser
  end # HTML
end # Oga
|
data/lib/oga/lru.rb
ADDED
@@ -0,0 +1,160 @@
|
|
1
|
+
module Oga
  ##
  # Thread-safe LRU cache using a Hash as the underlying storage engine.
  # Whenever the size of the cache exceeds the given limit the oldest keys are
  # removed (based on insert order). Reading a key does not refresh its
  # position; only (re-)assigning it does.
  #
  # This class uses its own list of keys (as returned by {LRU#keys}) instead of
  # relying on `Hash#keys` as the latter allocates a new Array upon every call.
  #
  # This class doesn't use MonitorMixin due to the extra overhead it adds
  # compared to using a Mutex directly.
  #
  # Example usage:
  #
  #     cache = LRU.new(3)
  #
  #     cache[:a] = 10
  #     cache[:b] = 20
  #     cache[:c] = 30
  #     cache[:d] = 40
  #
  #     cache.keys # => [:b, :c, :d]
  #
  # @api private
  #
  class LRU
    ##
    # @param [Integer] maximum The maximum number of keys to retain.
    #
    def initialize(maximum = 1024)
      @maximum = maximum
      @cache   = {}
      @keys    = []
      @mutex   = Mutex.new

      # Nobody owns the lock until a thread actually acquires the mutex.
      # Pre-assigning the creating thread here would let that thread skip the
      # mutex entirely.
      @owner = nil
    end

    ##
    # Changes the maximum size and immediately evicts the oldest keys if the
    # cache is now too large.
    #
    # @param [Integer] value
    #
    def maximum=(value)
      synchronize do
        @maximum = value

        resize
      end
    end

    ##
    # @return [Integer]
    #
    def maximum
      synchronize { @maximum }
    end

    ##
    # Returns the value of the key, or nil when the key is not present.
    #
    # @param [Mixed] key
    # @return [Mixed]
    #
    def [](key)
      synchronize { @cache[key] }
    end

    ##
    # Sets the key and its value. Old keys are discarded if the LRU size exceeds
    # the limit. Re-assigning an existing key moves it to the newest position.
    #
    # @param [Mixed] key
    # @param [Mixed] value
    #
    def []=(key, value)
      synchronize do
        @cache[key] = value

        # Array#delete is a no-op for unknown keys, so no separate membership
        # check (and thus no second scan of the list) is needed.
        @keys.delete(key)
        @keys << key

        resize
      end
    end

    ##
    # Returns a key if it exists, otherwise yields the supplied block and uses
    # its return value as the key value. The block runs while the lock is
    # held. A stored value of nil or false is treated as missing, so the block
    # is yielded again for such keys.
    #
    # @param [Mixed] key
    # @return [Mixed]
    #
    def get_or_set(key)
      synchronize { self[key] ||= yield }
    end

    ##
    # Returns the internal list of keys, oldest first. This is the live Array
    # used by the cache (no copy is made), so callers should not modify it.
    #
    # @return [Array]
    #
    def keys
      synchronize { @keys }
    end

    ##
    # @param [Mixed] key
    # @return [TrueClass|FalseClass]
    #
    def key?(key)
      synchronize { @cache.key?(key) }
    end

    ##
    # Removes all keys from the cache.
    #
    def clear
      synchronize do
        @keys.clear
        @cache.clear
      end
    end

    ##
    # @return [Integer]
    #
    def size
      synchronize { @cache.size }
    end

    alias_method :length, :size

    private

    ##
    # Yields the supplied block in a synchronized manner (if needed). This
    # method is heavily based on `MonitorMixin#mon_enter`.
    #
    # A thread that already holds the lock (a nested call such as `resize`
    # calling `size`) yields directly; any other thread has to acquire the
    # mutex first. Ownership is cleared again before the mutex is released so
    # that the next call from the same thread re-acquires the lock instead of
    # bypassing it.
    #
    def synchronize
      if @owner == Thread.current
        yield
      else
        @mutex.synchronize do
          @owner = Thread.current

          begin
            yield
          ensure
            @owner = nil
          end
        end
      end
    end

    ##
    # Removes old keys until the size of the hash no longer exceeds the maximum
    # size.
    #
    def resize
      return unless size > @maximum

      to_remove = @keys.shift(size - @maximum)

      to_remove.each { |key| @cache.delete(key) }
    end
  end # LRU
end # Oga
|
data/lib/oga/oga.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
module Oga
  ##
  # Parses the given XML document.
  #
  # @example
  #  document = Oga.parse_xml('<root>Hello</root>')
  #
  # @param [String|IO] xml
  # @param [Hash] options Passed as-is to {Oga::XML::Parser#initialize}.
  #
  # @see [Oga::XML::Lexer#initialize]
  #
  # @return [Oga::XML::Document]
  #
  def self.parse_xml(xml, options = {})
    XML::Parser.new(xml, options).parse
  end

  ##
  # Parses the given HTML document.
  #
  # @example
  #  document = Oga.parse_html('<html>...</html>')
  #
  # @param [String|IO] html
  # @param [Hash] options Passed as-is to {Oga::HTML::Parser#initialize}.
  #
  # @see [Oga::XML::Lexer#initialize]
  #
  # @return [Oga::XML::Document]
  #
  def self.parse_html(html, options = {})
    HTML::Parser.new(html, options).parse
  end

  ##
  # Parses the given XML document using the SAX parser.
  #
  # @example
  #  handler = SomeSaxHandler.new
  #
  #  Oga.sax_parse_xml(handler, '<root>Hello</root>')
  #
  # @param [Object] handler The SAX handler passed to the parser.
  # @param [String|IO] xml
  # @param [Hash] options
  #
  # @see [Oga::XML::SaxParser#initialize]
  #
  def self.sax_parse_xml(handler, xml, options = {})
    XML::SaxParser.new(handler, xml, options).parse
  end

  ##
  # Parses the given HTML document using the SAX parser.
  #
  # @example
  #  handler = SomeSaxHandler.new
  #
  #  Oga.sax_parse_html(handler, '<script>foo()</script>')
  #
  # @param [Object] handler The SAX handler passed to the parser.
  # @param [String|IO] html
  # @param [Hash] options
  #
  # @see [Oga::XML::SaxParser#initialize]
  #
  def self.sax_parse_html(handler, html, options = {})
    HTML::SaxParser.new(handler, html, options).parse
  end
end # Oga
|
data/lib/oga/version.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
module Oga
  ##
  # Counterpart of {Oga::Blacklist}: a name is allowed only when it is part of
  # the list of names.
  #
  # @api private
  #
  class Whitelist < Blacklist
    ##
    # Tells whether the given name is present in the list of names.
    #
    # @return [TrueClass|FalseClass]
    #
    def allow?(name)
      names.include?(name)
    end

    ##
    # Builds a Blacklist from the same list of names.
    #
    # @return [Oga::Blacklist]
    #
    def to_blacklist
      Blacklist.new(names)
    end
  end # Whitelist
end # Oga
|
@@ -0,0 +1,136 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Class for storing information about a single XML attribute.
    #
    class Attribute
      # The name of the attribute.
      # @return [String]
      attr_accessor :name

      # The namespace prefix of the attribute, if any.
      # @return [String]
      attr_accessor :namespace_name

      # The element this attribute belongs to.
      # @return [Oga::XML::Element]
      attr_accessor :element

      ##
      # The default namespace available to all attributes. This namespace can
      # not be modified.
      #
      # @return [Oga::XML::Namespace]
      #
      DEFAULT_NAMESPACE = Namespace.new(
        :name => 'xml',
        :uri  => XML::DEFAULT_NAMESPACE.uri
      ).freeze

      ##
      # @param [Hash] options
      #
      # @option options [String] :name
      # @option options [String] :namespace_name
      # @option options [String] :value
      # @option options [Oga::XML::Element] :element
      #
      def initialize(options = {})
        @name    = options[:name]
        @value   = options[:value]
        @element = options[:element]

        @namespace_name = options[:namespace_name]
      end

      ##
      # Returns the {Oga::XML::Namespace} instance for the current namespace
      # name. The result is memoized once a namespace has been found.
      #
      # Returns nil when the namespace can not be resolved, which includes the
      # case where the attribute is not attached to an element (instead of
      # raising a NoMethodError on the missing element).
      #
      # @return [Oga::XML::Namespace|NilClass]
      #
      def namespace
        return @namespace if @namespace

        @namespace =
          if namespace_name == DEFAULT_NAMESPACE.name
            DEFAULT_NAMESPACE
          elsif element
            element.available_namespaces[namespace_name]
          end
      end

      ##
      # Sets a new raw value. The value is flagged as not yet decoded so that
      # the next call to {#value} runs it through the entity decoder.
      #
      # @param [String] value
      #
      def value=(value)
        @value   = value
        @decoded = false
      end

      ##
      # Returns the value of the attribute or nil if no explicit value was set.
      #
      # The first read after assignment passes the raw value through
      # `EntityDecoder.try_decode` and stores the result, so decoding happens
      # at most once per assigned value.
      #
      # @return [String|NilClass]
      #
      def value
        if !@decoded && @value
          @value   = EntityDecoder.try_decode(@value, html?)
          @decoded = true
        end

        @value
      end

      ##
      # Returns the value as a String; an empty String when there is no value.
      #
      # @return [String]
      #
      def text
        value.to_s
      end

      alias_method :to_s, :text

      ##
      # Serializes the attribute as `name="value"`, prefixing the name with the
      # namespace name when one is set. A missing value produces an empty pair
      # of quotes.
      #
      # @return [String]
      #
      def to_xml
        full_name = namespace_name ? "#{namespace_name}:#{name}" : name
        current   = value
        encoded   = current ? Entities.encode_attribute(current) : nil

        %Q(#{full_name}="#{encoded}")
      end

      ##
      # Lists the name, namespace and value of the attribute, skipping any of
      # these that are not set.
      #
      # @return [String]
      #
      def inspect
        segments = []

        [:name, :namespace, :value].each do |attr|
          # Named "current" so the local does not shadow the #value method.
          current = send(attr)

          segments << "#{attr}: #{current.inspect}" if current
        end

        "Attribute(#{segments.join(' ')})"
      end

      private

      ##
      # Whether the owning element reports itself as HTML. False when the
      # attribute has no element.
      #
      # @return [TrueClass|FalseClass]
      #
      def html?
        !!@element && @element.html?
      end
    end # Attribute
  end # XML
end # Oga
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Node class representing a CDATA section.
    #
    class Cdata < CharacterNode
      ##
      # Serializes the node by wrapping its text in the CDATA start and end
      # markers.
      #
      # @return [String]
      #
      def to_xml
        '<![CDATA[' + text.to_s + ']]>'
      end
    end # Cdata
  end # XML
end # Oga
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Shared parent for nodes whose content is a piece of text, such as Text
    # and Comment nodes.
    #
    class CharacterNode < Node
      # The text stored in the node.
      # @return [String]
      attr_accessor :text

      ##
      # Runs the parent initializer with the same arguments and then stores
      # the `:text` option.
      #
      # @param [Hash] options
      #
      # @option options [String] :text The text of the node.
      #
      def initialize(options = {})
        super

        @text = options[:text]
      end

      ##
      # Returns the text as a String (an empty String when no text is set).
      #
      # @return [String]
      #
      def to_xml
        text.to_s
      end

      ##
      # Formats the node as `ClassName("text")`, using the class name without
      # its namespace.
      #
      # @return [String]
      #
      def inspect
        short_name = self.class.to_s.split('::').last

        "#{short_name}(#{text.inspect})"
      end
    end # CharacterNode
  end # XML
end # Oga
|