oga 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +13 -0
- data/LICENSE +19 -0
- data/README.md +171 -0
- data/doc/DCO.md +25 -0
- data/doc/changelog.md +7 -0
- data/doc/css/common.css +76 -0
- data/doc/migrating_from_nokogiri.md +169 -0
- data/ext/c/extconf.rb +13 -0
- data/ext/c/lexer.c +1518 -0
- data/ext/c/lexer.h +8 -0
- data/ext/c/lexer.rl +121 -0
- data/ext/c/liboga.c +6 -0
- data/ext/c/liboga.h +11 -0
- data/ext/java/Liboga.java +14 -0
- data/ext/java/org/liboga/xml/Lexer.java +829 -0
- data/ext/java/org/liboga/xml/Lexer.rl +151 -0
- data/ext/ragel/base_lexer.rl +323 -0
- data/lib/oga.rb +43 -0
- data/lib/oga/html/parser.rb +25 -0
- data/lib/oga/oga.rb +27 -0
- data/lib/oga/version.rb +3 -0
- data/lib/oga/xml/attribute.rb +111 -0
- data/lib/oga/xml/cdata.rb +24 -0
- data/lib/oga/xml/character_node.rb +39 -0
- data/lib/oga/xml/comment.rb +24 -0
- data/lib/oga/xml/doctype.rb +91 -0
- data/lib/oga/xml/document.rb +99 -0
- data/lib/oga/xml/element.rb +340 -0
- data/lib/oga/xml/lexer.rb +399 -0
- data/lib/oga/xml/namespace.rb +42 -0
- data/lib/oga/xml/node.rb +175 -0
- data/lib/oga/xml/node_set.rb +313 -0
- data/lib/oga/xml/parser.rb +556 -0
- data/lib/oga/xml/processing_instruction.rb +39 -0
- data/lib/oga/xml/pull_parser.rb +166 -0
- data/lib/oga/xml/querying.rb +32 -0
- data/lib/oga/xml/text.rb +16 -0
- data/lib/oga/xml/traversal.rb +48 -0
- data/lib/oga/xml/xml_declaration.rb +76 -0
- data/lib/oga/xpath/evaluator.rb +1748 -0
- data/lib/oga/xpath/lexer.rb +2043 -0
- data/lib/oga/xpath/node.rb +10 -0
- data/lib/oga/xpath/parser.rb +535 -0
- data/oga.gemspec +45 -0
- metadata +221 -0
@@ -0,0 +1,151 @@
|
|
1
|
+
package org.liboga.xml;
|
2
|
+
|
3
|
+
%%machine java_lexer;
|
4
|
+
|
5
|
+
import java.io.IOException;
|
6
|
+
|
7
|
+
import org.jcodings.Encoding;
|
8
|
+
|
9
|
+
import org.jruby.Ruby;
|
10
|
+
import org.jruby.RubyModule;
|
11
|
+
import org.jruby.RubyClass;
|
12
|
+
import org.jruby.RubyObject;
|
13
|
+
import org.jruby.RubyString;
|
14
|
+
import org.jruby.RubyFixnum;
|
15
|
+
import org.jruby.util.ByteList;
|
16
|
+
import org.jruby.anno.JRubyClass;
|
17
|
+
import org.jruby.anno.JRubyMethod;
|
18
|
+
import org.jruby.runtime.ThreadContext;
|
19
|
+
import org.jruby.runtime.ObjectAllocator;
|
20
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
21
|
+
|
22
|
+
/**
 * Lexer support class for JRuby.
 *
 * The Lexer class contains the raw Ragel loop and calls back in to Ruby land
 * whenever a Ragel action is needed, similar to the C extension setup.
 *
 * This class requires Ruby land to first define the `Oga::XML` namespace.
 */
@JRubyClass(name="Oga::XML::Lexer", parent="Object")
public class Lexer extends RubyObject
{
    /**
     * The Ruby runtime this lexer was allocated in. It is assigned once in
     * the constructor and used to create Ruby Strings and to look up the
     * current thread context when calling back in to Ruby.
     */
    private final Ruby runtime;

    %% write data;

    /*
     * Used by Ragel to keep track of the current state. These are instance
     * fields (instead of locals) because the Ragel `variable` statements at
     * the bottom of this file map them to `this.act` and `this.cs`.
     */
    int act;
    int cs;

    /**
     * Sets up the current class in the Ruby runtime.
     *
     * The class is defined as `Lexer` under the `XML` constant of the `Oga`
     * module, using Object as its parent class. Both `Oga` and `Oga::XML`
     * must already exist when this method is called.
     *
     * @param runtime The Ruby runtime to define the class in.
     */
    public static void load(Ruby runtime)
    {
        RubyModule xml = (RubyModule) runtime.getModule("Oga")
            .getConstant("XML");

        RubyClass lexer = xml.defineClassUnder(
            "Lexer",
            runtime.getObject(),
            ALLOCATOR
        );

        lexer.defineAnnotatedMethods(Lexer.class);
    }

    /**
     * Allocator handed to the Ruby runtime so that new `Oga::XML::Lexer`
     * objects are backed by instances of this Java class.
     */
    private static final ObjectAllocator ALLOCATOR = new ObjectAllocator()
    {
        @Override
        public IRubyObject allocate(Ruby runtime, RubyClass klass)
        {
            return new org.liboga.xml.Lexer(runtime, klass);
        }
    };

    /**
     * @param runtime The Ruby runtime the object is created in.
     * @param klass The Ruby class of the object.
     */
    public Lexer(Ruby runtime, RubyClass klass)
    {
        super(runtime, klass);

        this.runtime = runtime;
    }

    /**
     * Runs the bulk of the Ragel loop and calls back in to Ruby.
     *
     * The input is taken from the String passed as `rb_str`. Its encoding is
     * handed to the Ragel actions so that token values share the same
     * encoding as the input.
     *
     * The local variables declared here (`data`, `encoding`, `ts`, `te`, `p`,
     * `mark`, `pe` and `eof`) are referenced by name from the generated Ragel
     * code and from the actions in base_lexer.rl, so they must not be
     * renamed.
     *
     * @param context The current thread context.
     * @param rb_str The String to lex.
     * @return Always nil.
     */
    @JRubyMethod
    public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
    {
        Encoding encoding = rb_str.getEncoding();

        byte[] data = rb_str.getBytes();

        int ts   = 0;
        int te   = 0;
        int p    = 0;
        int mark = 0;
        int pe   = data.length;
        int eof  = data.length;

        %% write exec;

        return context.nil;
    }

    /**
     * Resets the internal state of the lexer.
     *
     * This clears `act` and puts `cs` back to the start state of the
     * `java_lexer` machine.
     *
     * @param context The current thread context.
     * @return Always nil.
     */
    @JRubyMethod
    public IRubyObject reset_native(ThreadContext context)
    {
        this.act = 0;
        this.cs  = java_lexer_start;

        return context.nil;
    }

    /**
     * Calls back in to Ruby land passing the current token value along.
     *
     * This method calls back in to Ruby land based on the method name
     * specified in `name`. The Ruby callback should take one argument. This
     * argument will be a String containing the value of the current token.
     *
     * @param name The name of the Ruby method to call on this object.
     * @param data The bytes of the input being lexed.
     * @param enc The encoding to assign to the token value.
     * @param ts Offset in `data` of the first byte of the token.
     * @param te Offset in `data` just past the last byte of the token.
     */
    public void callback(String name, byte[] data, Encoding enc, int ts, int te)
    {
        // The token is the slice starting at `ts` with a length of `te - ts`.
        ByteList bytelist = new ByteList(data, ts, te - ts, enc, true);

        RubyString value = this.runtime.newString(bytelist);

        ThreadContext context = this.runtime.getCurrentContext();

        this.callMethod(context, name, value);
    }

    /**
     * Calls back in to Ruby land without passing any arguments.
     *
     * @param name The name of the Ruby method to call on this object.
     */
    public void callback_simple(String name)
    {
        ThreadContext context = this.runtime.getCurrentContext();

        this.callMethod(context, name);
    }
}
|
145
|
+
|
146
|
+
%%{
|
147
|
+
variable act this.act;
|
148
|
+
variable cs this.cs;
|
149
|
+
|
150
|
+
include base_lexer "base_lexer.rl";
|
151
|
+
}%%
|
@@ -0,0 +1,323 @@
|
|
1
|
+
%%machine base_lexer;
|
2
|
+
|
3
|
+
%%{
|
4
|
+
##
|
5
|
+
# Base grammar for the XML lexer.
|
6
|
+
#
|
7
|
+
# This grammar is shared between the C and Java extensions. As a result of
|
8
|
+
# this you should **not** include language specific code in Ragel
|
9
|
+
# actions/callbacks.
|
10
|
+
#
|
11
|
+
# To call back in to Ruby you can use one of the following two functions:
|
12
|
+
#
|
13
|
+
# * callback
|
14
|
+
# * callback_simple
|
15
|
+
#
|
16
|
+
# The first function takes 5 arguments:
|
17
|
+
#
|
18
|
+
# * The name of the Ruby method to call.
|
19
|
+
# * The input data.
|
20
|
+
# * The encoding of the input data.
|
21
|
+
# * The start of the current buffer.
|
22
|
+
# * The end of the current buffer.
|
23
|
+
#
|
24
|
+
# The function callback_simple only takes one argument: the name of the
|
25
|
+
# method to call. This function should be used for callbacks that don't
|
26
|
+
# require any values.
|
27
|
+
#
|
28
|
+
# When you call a method in Ruby make sure that said method is defined as
|
29
|
+
# an instance method in the `Oga::XML::Lexer` class.
|
30
|
+
#
|
31
|
+
# ## Machine Transitions
|
32
|
+
#
|
33
|
+
# To transition from one machine to another always use `fnext` instead of
|
34
|
+
# `fcall` and `fret`. This removes the need for the code to keep track of a
|
35
|
+
# stack.
|
36
|
+
#
|
37
|
+
|
38
|
+
# A line break: either a line feed on its own or a carriage return followed
# by a line feed.
newline = '\n' | '\r\n';

# A single space or tab. Line breaks are not part of this class, they are
# matched by `newline`.
whitespace = [ \t];

# A single character that may appear in an identifier: an ASCII letter, a
# digit, a dash or an underscore.
ident_char = [a-zA-Z0-9\-_];

# One or more identifier characters.
identifier = ident_char+;
|
42
|
+
|
43
|
+
# Comments
|
44
|
+
#
|
45
|
+
# http://www.w3.org/TR/html-markup/syntax.html#comments
|
46
|
+
#
|
47
|
+
# Unlike the W3 specification these rules *do* allow character sequences
|
48
|
+
# such as `--` and `->`. Putting extra checks in for these sequences would
|
49
|
+
# actually make the rules/actions more complex.
|
50
|
+
#
|
51
|
+
|
52
|
+
comment_start = '<!--';
comment_end = '-->';

# A complete comment: the opening sequence, any text that does not contain
# the closing sequence, and the closing sequence itself.
comment = comment_start (any* -- comment_end) comment_end;

# Passes the comment body to `on_comment`. The offsets skip the 4 characters
# of comment_start and the 3 characters of comment_end so that only the text
# in between is included.
action start_comment {
    callback("on_comment", data, encoding, ts + 4, te - 3);
}
|
59
|
+
|
60
|
+
# CDATA
|
61
|
+
#
|
62
|
+
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
|
63
|
+
#
|
64
|
+
# In HTML CDATA tags have no meaning/are not supported. Oga does
|
65
|
+
# support them but treats their contents as plain text.
|
66
|
+
#
|
67
|
+
|
68
|
+
cdata_start = '<![CDATA[';
cdata_end = ']]>';

# A complete CDATA section: the opening sequence, any text that does not
# contain the closing sequence, and the closing sequence itself.
cdata = cdata_start (any* -- cdata_end) cdata_end;

# Passes the CDATA body to `on_cdata`. The offsets skip the 9 characters of
# cdata_start and the 3 characters of cdata_end so that only the text in
# between is included.
action start_cdata {
    callback("on_cdata", data, encoding, ts + 9, te - 3);
}
|
75
|
+
|
76
|
+
# Processing Instructions
|
77
|
+
#
|
78
|
+
# http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
|
79
|
+
# http://en.wikipedia.org/wiki/Processing_Instruction
|
80
|
+
#
|
81
|
+
# These are tags meant to be used by parsers/libraries for custom behaviour.
|
82
|
+
# One example is the pair of tags used by PHP: <?php and ?>. Note that the XML
|
83
|
+
# declaration tags (<?xml ?>) are not considered to be a processing
|
84
|
+
# instruction.
|
85
|
+
#
|
86
|
+
|
87
|
+
# The start of a processing instruction: "<?" directly followed by its name.
proc_ins_start = '<?' identifier;
proc_ins_end = '?>';

# Signals the start of a processing instruction and emits its name, then
# hands over to the proc_ins_body machine for the remainder.
action start_proc_ins {
    callback_simple("on_proc_ins_start");

    # Skip the 2 characters of "<?" so only the name is passed along.
    callback("on_proc_ins_name", data, encoding, ts + 2, te);

    # Remember where the name ends. proc_ins_body uses this as the start of
    # the text it emits once the closing "?>" is found.
    mark = te;

    fnext proc_ins_body;
}
|
98
|
+
|
99
|
+
# Machine for the body of a processing instruction. Characters are consumed
# without emitting anything until proc_ins_end is matched.
proc_ins_body := |*
    proc_ins_end => {
        # Emit everything from `mark` (set by start_proc_ins) up to the start
        # of the closing "?>" as a single text value.
        callback("on_text", data, encoding, mark, ts);
        callback_simple("on_proc_ins_end");

        fnext main;
    };

    # Any other character is part of the body and is picked up by the
    # `on_text` callback above.
    any;
*|;
|
109
|
+
|
110
|
+
# Strings
|
111
|
+
#
|
112
|
+
# Strings in HTML can either be single or double quoted. If a string
|
113
|
+
# starts with one of these quotes it must be closed with the same type
|
114
|
+
# of quote.
|
115
|
+
#
|
116
|
+
dquote = '"';
squote = "'";

# A quote, followed by any number of characters other than that quote,
# followed by the same quote. There is no support for escaped quotes.
string_dquote = (dquote ^dquote* dquote);
string_squote = (squote ^squote* squote);

string = string_dquote | string_squote;

# Passes the contents of a string to `on_string`. The offsets drop the
# opening and closing quote.
action emit_string {
    callback("on_string", data, encoding, ts + 1, te - 1);
}
|
127
|
+
|
128
|
+
# DOCTYPES
|
129
|
+
#
|
130
|
+
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
|
131
|
+
#
|
132
|
+
# These rules support the 3 flavours of doctypes:
|
133
|
+
#
|
134
|
+
# 1. Normal doctypes, as introduced in the HTML5 specification.
|
135
|
+
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
|
136
|
+
# 3. Legacy doctypes
|
137
|
+
#
|
138
|
+
# The start of a doctype, matched case insensitively (note the `i` suffix)
# and followed by at least one space or tab.
doctype_start = '<!DOCTYPE'i whitespace+;

# Signals the start of a doctype and hands over to the doctype machine.
action start_doctype {
    callback_simple("on_doctype_start");
    fnext doctype;
}
|
144
|
+
|
145
|
+
# Machine for processing doctypes. Doctype values such as the public
# and system IDs are emitted using emit_string (the `on_string` callback).
doctype := |*
    # The doctype type keyword. Unlike doctype_start these literals are
    # matched case sensitively.
    'PUBLIC' | 'SYSTEM' => {
        callback("on_doctype_type", data, encoding, ts, te);
    };

    # Consumes everything between the [ and ]. Due to the use of :> the ]
    # is not consumed by any+. The offsets drop the surrounding brackets.
    '[' any+ :> ']' => {
        callback("on_doctype_inline", data, encoding, ts + 1, te - 1);
    };

    # Lex the public/system IDs as regular strings.
    string => emit_string;

    # Whitespace inside doctypes is ignored since there's no point in
    # including it.
    whitespace;

    # The name of the doctype.
    identifier => {
        callback("on_doctype_name", data, encoding, ts, te);
    };

    # End of the doctype, continue lexing in the main machine.
    '>' => {
        callback_simple("on_doctype_end");
        fnext main;
    };
*|;
|
174
|
+
|
175
|
+
# XML declaration tags
|
176
|
+
#
|
177
|
+
# http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
|
178
|
+
#
|
179
|
+
xml_decl_start = '<?xml';
xml_decl_end = '?>';

# Signals the start of an XML declaration and hands over to the xml_decl
# machine for its contents.
action start_xml_decl {
    callback_simple("on_xml_decl_start");
    fnext xml_decl;
}
|
186
|
+
|
187
|
+
# Machine that processes the contents of an XML declaration tag.
xml_decl := |*
    # End of the declaration, continue lexing in the main machine.
    xml_decl_end => {
        callback_simple("on_xml_decl_end");
        fnext main;
    };

    # Attributes and their values (e.g. version="1.0").
    identifier => {
        callback("on_attribute", data, encoding, ts, te);
    };

    string => emit_string;

    # Anything else (whitespace, "=", etc) is consumed without emitting
    # anything.
    any;
*|;
|
203
|
+
|
204
|
+
# Elements
|
205
|
+
#
|
206
|
+
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
|
207
|
+
#
|
208
|
+
# Lexing of elements is broken up into different machines that handle the
|
209
|
+
# name/namespace, contents of the open tag and the body of an element. The
|
210
|
+
# body of an element is lexed using the `main` machine.
|
211
|
+
#
|
212
|
+
|
213
|
+
# The start of an opening tag: "<" followed by the first character of the
# element's name or namespace.
element_start = '<' ident_char;

# A complete closing tag, optionally with one or more "name:" style prefixes.
element_end = '</' identifier (':' identifier)* '>';

# Signals the start of an element and hands over to the element_name
# machine.
action start_element {
    callback_simple("on_element_start");

    # element_start already consumed the first character of the name, step
    # back so that element_name gets to see it.
    fhold;
    fnext element_name;
}

# Signals the end of an element. The name in the closing tag is not passed
# along.
action close_element {
    callback_simple("on_element_end");
}
|
225
|
+
|
226
|
+
# Machine used for lexing the name/namespace of an element.
element_name := |*
    # A namespace prefix. The trailing colon is not included in the value.
    identifier ':' => {
        callback("on_element_ns", data, encoding, ts, te - 1);
    };

    # The element name itself. Once it has been emitted the rest of the
    # opening tag is handled by element_head.
    identifier => {
        callback("on_element_name", data, encoding, ts, te);
        fnext element_head;
    };
*|;
|
237
|
+
|
238
|
+
# Machine used for processing the contents of an element's starting tag
# after its name. This covers the attributes (names, namespaces and values)
# and the end of the tag. The element's own name and namespace are lexed by
# element_name before this machine is entered.
element_head := |*
    # Spaces, tabs and the "=" between an attribute and its value are
    # consumed without emitting anything.
    whitespace | '=';

    newline => {
        callback_simple("advance_line");
    };

    # Attribute names and namespaces. For namespaces the trailing colon is
    # not included in the value.
    identifier ':' => {
        callback("on_attribute_ns", data, encoding, ts, te - 1);
    };

    identifier => {
        callback("on_attribute", data, encoding, ts, te);
    };

    # Attribute values.
    string => emit_string;

    # We're done with the open tag of the element.
    '>' => {
        callback_simple("on_element_open_end");
        fnext main;
    };

    # Self closing tags.
    '/>' => {
        callback_simple("on_element_end");
        fnext main;
    };
*|;
|
271
|
+
|
272
|
+
# Text
|
273
|
+
#
|
274
|
+
# http://www.w3.org/TR/xml/#syntax
|
275
|
+
# http://www.w3.org/TR/html-markup/syntax.html#text-syntax
|
276
|
+
#
|
277
|
+
# Text content is everything leading up to certain special tags such as "</"
|
278
|
+
# and "<?".
|
279
|
+
|
280
|
+
# Hands over to the text machine. This action is used for the catch-all
# `any` rule of the main machine, so the character that was just consumed is
# put back first to make it part of the text.
action start_text {
    fhold;
    fnext text;
}
|
284
|
+
|
285
|
+
# These sequences terminate a T_TEXT sequence and instruct Ragel to jump
# back to the main machine.
#
# Note that this only works if each sequence is exactly 2 characters
# long. Because of this "<!" is used instead of "<!--". The same applies to
# element_start, which is "<" followed by a single identifier character.

terminate_text = '</' | '<!' | '<?' | element_start;

# Any text that does not contain one of the terminating sequences.
allowed_text = any* -- terminate_text;
|
293
|
+
|
294
|
+
# Machine for lexing text nodes. Both rules return to the main machine once
# the text has been emitted.
text := |*
    # Text followed by a special tag, such as "foo<!--"
    #
    # `mark` is updated while allowed_text is being matched and is used as
    # the end of the emitted text, so the terminating sequence is not part
    # of the value.
    allowed_text @{ mark = p; } terminate_text => {
        callback("on_text", data, encoding, ts, mark);

        # Move the cursor back to just before `mark` so that the terminating
        # sequence is not consumed here and can be lexed again by the main
        # machine. `mark` is cleared since it's no longer needed.
        p = mark - 1;
        mark = 0;

        fnext main;
    };

    # Just regular text.
    allowed_text => {
        callback("on_text", data, encoding, ts, te);
        fnext main;
    };
*|;
|
311
|
+
|
312
|
+
# The main machine aka the entry point of Ragel.
#
# Each rule either emits a complete token right away (comments, CDATA and
# closing tags) or hands over to a dedicated machine using `fnext`. Those
# machines jump back here once they are done.
main := |*
    doctype_start => start_doctype;
    xml_decl_start => start_xml_decl;
    comment => start_comment;
    cdata => start_cdata;
    proc_ins_start => start_proc_ins;
    element_start => start_element;
    element_end => close_element;

    # Anything not matched by the rules above is treated as text.
    any => start_text;
*|;
|
323
|
+
}%%
|