oga 0.1.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +13 -0
- data/LICENSE +19 -0
- data/README.md +179 -0
- data/doc/DCO.md +25 -0
- data/doc/changelog.md +20 -0
- data/doc/css/common.css +76 -0
- data/doc/migrating_from_nokogiri.md +169 -0
- data/ext/c/extconf.rb +13 -0
- data/ext/c/lexer.c +1518 -0
- data/ext/c/lexer.h +8 -0
- data/ext/c/lexer.rl +121 -0
- data/ext/c/liboga.c +6 -0
- data/ext/c/liboga.h +11 -0
- data/ext/java/Liboga.java +14 -0
- data/ext/java/org/liboga/xml/Lexer.java +829 -0
- data/ext/java/org/liboga/xml/Lexer.rl +151 -0
- data/ext/ragel/base_lexer.rl +323 -0
- data/lib/liboga.jar +0 -0
- data/lib/oga.rb +43 -0
- data/lib/oga/html/parser.rb +25 -0
- data/lib/oga/oga.rb +27 -0
- data/lib/oga/version.rb +3 -0
- data/lib/oga/xml/attribute.rb +111 -0
- data/lib/oga/xml/cdata.rb +17 -0
- data/lib/oga/xml/character_node.rb +39 -0
- data/lib/oga/xml/comment.rb +17 -0
- data/lib/oga/xml/doctype.rb +84 -0
- data/lib/oga/xml/document.rb +99 -0
- data/lib/oga/xml/element.rb +331 -0
- data/lib/oga/xml/lexer.rb +399 -0
- data/lib/oga/xml/namespace.rb +42 -0
- data/lib/oga/xml/node.rb +168 -0
- data/lib/oga/xml/node_set.rb +313 -0
- data/lib/oga/xml/parser.rb +556 -0
- data/lib/oga/xml/processing_instruction.rb +39 -0
- data/lib/oga/xml/pull_parser.rb +180 -0
- data/lib/oga/xml/querying.rb +32 -0
- data/lib/oga/xml/text.rb +11 -0
- data/lib/oga/xml/traversal.rb +48 -0
- data/lib/oga/xml/xml_declaration.rb +69 -0
- data/lib/oga/xpath/evaluator.rb +1748 -0
- data/lib/oga/xpath/lexer.rb +2043 -0
- data/lib/oga/xpath/node.rb +10 -0
- data/lib/oga/xpath/parser.rb +537 -0
- data/oga.gemspec +45 -0
- metadata +221 -0
@@ -0,0 +1,151 @@
|
|
1
|
+
package org.liboga.xml;
|
2
|
+
|
3
|
+
%%machine java_lexer;
|
4
|
+
|
5
|
+
import java.io.IOException;
|
6
|
+
|
7
|
+
import org.jcodings.Encoding;
|
8
|
+
|
9
|
+
import org.jruby.Ruby;
|
10
|
+
import org.jruby.RubyModule;
|
11
|
+
import org.jruby.RubyClass;
|
12
|
+
import org.jruby.RubyObject;
|
13
|
+
import org.jruby.RubyString;
|
14
|
+
import org.jruby.RubyFixnum;
|
15
|
+
import org.jruby.util.ByteList;
|
16
|
+
import org.jruby.anno.JRubyClass;
|
17
|
+
import org.jruby.anno.JRubyMethod;
|
18
|
+
import org.jruby.runtime.ThreadContext;
|
19
|
+
import org.jruby.runtime.ObjectAllocator;
|
20
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
21
|
+
|
22
|
+
/**
|
23
|
+
* Lexer support class for JRuby.
|
24
|
+
*
|
25
|
+
* The Lexer class contains the raw Ragel loop and calls back in to Ruby land
|
26
|
+
* whenever a Ragel action is needed similar to the C extension setup.
|
27
|
+
*
|
28
|
+
* This class requires Ruby land to first define the `Oga::XML` namespace.
|
29
|
+
*/
|
30
|
+
@JRubyClass(name="Oga::XML::Lexer", parent="Object")
|
31
|
+
public class Lexer extends RubyObject
|
32
|
+
{
|
33
|
+
/**
|
34
|
+
* The current Ruby runtime.
|
35
|
+
*/
|
36
|
+
private Ruby runtime;
|
37
|
+
|
38
|
+
%% write data;
|
39
|
+
|
40
|
+
/* Used by Ragel to keep track of the current state. */
|
41
|
+
int act;
|
42
|
+
int cs;
|
43
|
+
|
44
|
+
/**
|
45
|
+
* Sets up the current class in the Ruby runtime.
|
46
|
+
*/
|
47
|
+
public static void load(Ruby runtime)
|
48
|
+
{
|
49
|
+
RubyModule xml = (RubyModule) runtime.getModule("Oga")
|
50
|
+
.getConstant("XML");
|
51
|
+
|
52
|
+
RubyClass lexer = xml.defineClassUnder(
|
53
|
+
"Lexer",
|
54
|
+
runtime.getObject(),
|
55
|
+
ALLOCATOR
|
56
|
+
);
|
57
|
+
|
58
|
+
lexer.defineAnnotatedMethods(Lexer.class);
|
59
|
+
}
|
60
|
+
|
61
|
+
private static final ObjectAllocator ALLOCATOR = new ObjectAllocator()
|
62
|
+
{
|
63
|
+
public IRubyObject allocate(Ruby runtime, RubyClass klass)
|
64
|
+
{
|
65
|
+
return new org.liboga.xml.Lexer(runtime, klass);
|
66
|
+
}
|
67
|
+
};
|
68
|
+
|
69
|
+
public Lexer(Ruby runtime, RubyClass klass)
|
70
|
+
{
|
71
|
+
super(runtime, klass);
|
72
|
+
|
73
|
+
this.runtime = runtime;
|
74
|
+
}
|
75
|
+
|
76
|
+
/**
|
77
|
+
* Runs the bulk of the Ragel loop and calls back in to Ruby.
|
78
|
+
*
|
79
|
+
* This method reads its input from the `rb_str` argument instead of an
|
80
|
+
* instance variable, so the Ruby side of the Lexer class must pass the
|
81
|
+
* String to lex when calling it. The encoding of this String is passed
|
82
|
+
* along so that token values share the same encoding as the input.
|
83
|
+
*
|
84
|
+
* This method always returns nil.
|
85
|
+
*/
|
86
|
+
@JRubyMethod
|
87
|
+
public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
|
88
|
+
{
|
89
|
+
Encoding encoding = rb_str.getEncoding();
|
90
|
+
|
91
|
+
byte[] data = rb_str.getBytes();
|
92
|
+
|
93
|
+
int ts = 0;
|
94
|
+
int te = 0;
|
95
|
+
int p = 0;
|
96
|
+
int mark = 0;
|
97
|
+
int pe = data.length;
|
98
|
+
int eof = data.length;
|
99
|
+
|
100
|
+
%% write exec;
|
101
|
+
|
102
|
+
return context.nil;
|
103
|
+
}
|
104
|
+
|
105
|
+
/**
|
106
|
+
* Resets the internal state of the lexer.
|
107
|
+
*/
|
108
|
+
@JRubyMethod
|
109
|
+
public IRubyObject reset_native(ThreadContext context)
|
110
|
+
{
|
111
|
+
this.act = 0;
|
112
|
+
this.cs = java_lexer_start;
|
113
|
+
|
114
|
+
return context.nil;
|
115
|
+
}
|
116
|
+
|
117
|
+
/**
|
118
|
+
* Calls back in to Ruby land passing the current token value along.
|
119
|
+
*
|
120
|
+
* This method calls back in to Ruby land based on the method name
|
121
|
+
* specified in `name`. The Ruby callback should take one argument. This
|
122
|
+
* argument will be a String containing the value of the current token.
|
123
|
+
*/
|
124
|
+
public void callback(String name, byte[] data, Encoding enc, int ts, int te)
|
125
|
+
{
|
126
|
+
ByteList bytelist = new ByteList(data, ts, te - ts, enc, true);
|
127
|
+
|
128
|
+
RubyString value = this.runtime.newString(bytelist);
|
129
|
+
|
130
|
+
ThreadContext context = this.runtime.getCurrentContext();
|
131
|
+
|
132
|
+
this.callMethod(context, name, value);
|
133
|
+
}
|
134
|
+
|
135
|
+
/**
|
136
|
+
* Calls back in to Ruby land without passing any arguments.
|
137
|
+
*/
|
138
|
+
public void callback_simple(String name)
|
139
|
+
{
|
140
|
+
ThreadContext context = this.runtime.getCurrentContext();
|
141
|
+
|
142
|
+
this.callMethod(context, name);
|
143
|
+
}
|
144
|
+
}
|
145
|
+
|
146
|
+
%%{
|
147
|
+
variable act this.act;
|
148
|
+
variable cs this.cs;
|
149
|
+
|
150
|
+
include base_lexer "base_lexer.rl";
|
151
|
+
}%%
|
@@ -0,0 +1,323 @@
|
|
1
|
+
%%machine base_lexer;
|
2
|
+
|
3
|
+
%%{
|
4
|
+
##
|
5
|
+
# Base grammar for the XML lexer.
|
6
|
+
#
|
7
|
+
# This grammar is shared between the C and Java extensions. As a result of
|
8
|
+
# this you should **not** include language specific code in Ragel
|
9
|
+
# actions/callbacks.
|
10
|
+
#
|
11
|
+
# To call back in to Ruby you can use one of the following two functions:
|
12
|
+
#
|
13
|
+
# * callback
|
14
|
+
# * callback_simple
|
15
|
+
#
|
16
|
+
# The first function takes 5 arguments:
|
17
|
+
#
|
18
|
+
# * The name of the Ruby method to call.
|
19
|
+
# * The input data.
|
20
|
+
# * The encoding of the input data.
|
21
|
+
# * The start of the current buffer.
|
22
|
+
# * The end of the current buffer.
|
23
|
+
#
|
24
|
+
# The function callback_simple only takes one argument: the name of the
|
25
|
+
# method to call. This function should be used for callbacks that don't
|
26
|
+
# require any values.
|
27
|
+
#
|
28
|
+
# When you call a method in Ruby make sure that said method is defined as
|
29
|
+
# an instance method in the `Oga::XML::Lexer` class.
|
30
|
+
#
|
31
|
+
# ## Machine Transitions
|
32
|
+
#
|
33
|
+
# To transition from one machine to another always use `fnext` instead of
|
34
|
+
# `fcall` and `fret`. This removes the need for the code to keep track of a
|
35
|
+
# stack.
|
36
|
+
#
|
37
|
+
|
38
|
+
newline = '\n' | '\r\n';
|
39
|
+
whitespace = [ \t];
|
40
|
+
ident_char = [a-zA-Z0-9\-_];
|
41
|
+
identifier = ident_char+;
|
42
|
+
|
43
|
+
# Comments
|
44
|
+
#
|
45
|
+
# http://www.w3.org/TR/html-markup/syntax.html#comments
|
46
|
+
#
|
47
|
+
# Unlike the W3 specification these rules *do* allow character sequences
|
48
|
+
# such as `--` and `->`. Putting extra checks in for these sequences would
|
49
|
+
# actually make the rules/actions more complex.
|
50
|
+
#
|
51
|
+
|
52
|
+
comment_start = '<!--';
|
53
|
+
comment_end = '-->';
|
54
|
+
comment = comment_start (any* -- comment_end) comment_end;
|
55
|
+
|
56
|
+
action start_comment {
|
57
|
+
callback("on_comment", data, encoding, ts + 4, te - 3);
|
58
|
+
}
|
59
|
+
|
60
|
+
# CDATA
|
61
|
+
#
|
62
|
+
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
|
63
|
+
#
|
64
|
+
# In HTML CDATA tags have no meaning/are not supported. Oga does
|
65
|
+
# support them but treats their contents as plain text.
|
66
|
+
#
|
67
|
+
|
68
|
+
cdata_start = '<![CDATA[';
|
69
|
+
cdata_end = ']]>';
|
70
|
+
cdata = cdata_start (any* -- cdata_end) cdata_end;
|
71
|
+
|
72
|
+
action start_cdata {
|
73
|
+
callback("on_cdata", data, encoding, ts + 9, te - 3);
|
74
|
+
}
|
75
|
+
|
76
|
+
# Processing Instructions
|
77
|
+
#
|
78
|
+
# http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
|
79
|
+
# http://en.wikipedia.org/wiki/Processing_Instruction
|
80
|
+
#
|
81
|
+
# These are tags meant to be used by parsers/libraries for custom behaviour.
|
82
|
+
# One example is the pair of tags used by PHP: <?php and ?>. Note that the XML
|
83
|
+
# declaration tags (<?xml ?>) are not considered to be a processing
|
84
|
+
# instruction.
|
85
|
+
#
|
86
|
+
|
87
|
+
proc_ins_start = '<?' identifier;
|
88
|
+
proc_ins_end = '?>';
|
89
|
+
|
90
|
+
action start_proc_ins {
|
91
|
+
callback_simple("on_proc_ins_start");
|
92
|
+
callback("on_proc_ins_name", data, encoding, ts + 2, te);
|
93
|
+
|
94
|
+
mark = te;
|
95
|
+
|
96
|
+
fnext proc_ins_body;
|
97
|
+
}
|
98
|
+
|
99
|
+
proc_ins_body := |*
|
100
|
+
proc_ins_end => {
|
101
|
+
callback("on_text", data, encoding, mark, ts);
|
102
|
+
callback_simple("on_proc_ins_end");
|
103
|
+
|
104
|
+
fnext main;
|
105
|
+
};
|
106
|
+
|
107
|
+
any;
|
108
|
+
*|;
|
109
|
+
|
110
|
+
# Strings
|
111
|
+
#
|
112
|
+
# Strings in HTML can either be single or double quoted. If a string
|
113
|
+
# starts with one of these quotes it must be closed with the same type
|
114
|
+
# of quote.
|
115
|
+
#
|
116
|
+
dquote = '"';
|
117
|
+
squote = "'";
|
118
|
+
|
119
|
+
string_dquote = (dquote ^dquote* dquote);
|
120
|
+
string_squote = (squote ^squote* squote);
|
121
|
+
|
122
|
+
string = string_dquote | string_squote;
|
123
|
+
|
124
|
+
action emit_string {
|
125
|
+
callback("on_string", data, encoding, ts + 1, te - 1);
|
126
|
+
}
|
127
|
+
|
128
|
+
# DOCTYPES
|
129
|
+
#
|
130
|
+
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
|
131
|
+
#
|
132
|
+
# These rules support the 3 flavours of doctypes:
|
133
|
+
#
|
134
|
+
# 1. Normal doctypes, as introduced in the HTML5 specification.
|
135
|
+
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
|
136
|
+
# 3. Legacy doctypes
|
137
|
+
#
|
138
|
+
doctype_start = '<!DOCTYPE'i whitespace+;
|
139
|
+
|
140
|
+
action start_doctype {
|
141
|
+
callback_simple("on_doctype_start");
|
142
|
+
fnext doctype;
|
143
|
+
}
|
144
|
+
|
145
|
+
# Machine for processing doctypes. Doctype values such as the public
|
146
|
+
# and system IDs are treated as T_STRING tokens.
|
147
|
+
doctype := |*
|
148
|
+
'PUBLIC' | 'SYSTEM' => {
|
149
|
+
callback("on_doctype_type", data, encoding, ts, te);
|
150
|
+
};
|
151
|
+
|
152
|
+
# Consumes everything between the [ and ]. Due to the use of :> the ]
|
153
|
+
# is not consumed by any+.
|
154
|
+
'[' any+ :> ']' => {
|
155
|
+
callback("on_doctype_inline", data, encoding, ts + 1, te - 1);
|
156
|
+
};
|
157
|
+
|
158
|
+
# Lex the public/system IDs as regular strings.
|
159
|
+
string => emit_string;
|
160
|
+
|
161
|
+
# Whitespace inside doctypes is ignored since there's no point in
|
162
|
+
# including it.
|
163
|
+
whitespace;
|
164
|
+
|
165
|
+
identifier => {
|
166
|
+
callback("on_doctype_name", data, encoding, ts, te);
|
167
|
+
};
|
168
|
+
|
169
|
+
'>' => {
|
170
|
+
callback_simple("on_doctype_end");
|
171
|
+
fnext main;
|
172
|
+
};
|
173
|
+
*|;
|
174
|
+
|
175
|
+
# XML declaration tags
|
176
|
+
#
|
177
|
+
# http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
|
178
|
+
#
|
179
|
+
xml_decl_start = '<?xml';
|
180
|
+
xml_decl_end = '?>';
|
181
|
+
|
182
|
+
action start_xml_decl {
|
183
|
+
callback_simple("on_xml_decl_start");
|
184
|
+
fnext xml_decl;
|
185
|
+
}
|
186
|
+
|
187
|
+
# Machine that processes the contents of an XML declaration tag.
|
188
|
+
xml_decl := |*
|
189
|
+
xml_decl_end => {
|
190
|
+
callback_simple("on_xml_decl_end");
|
191
|
+
fnext main;
|
192
|
+
};
|
193
|
+
|
194
|
+
# Attributes and their values (e.g. version="1.0").
|
195
|
+
identifier => {
|
196
|
+
callback("on_attribute", data, encoding, ts, te);
|
197
|
+
};
|
198
|
+
|
199
|
+
string => emit_string;
|
200
|
+
|
201
|
+
any;
|
202
|
+
*|;
|
203
|
+
|
204
|
+
# Elements
|
205
|
+
#
|
206
|
+
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
|
207
|
+
#
|
208
|
+
# Lexing of elements is broken up into different machines that handle the
|
209
|
+
# name/namespace, contents of the open tag and the body of an element. The
|
210
|
+
# body of an element is lexed using the `main` machine.
|
211
|
+
#
|
212
|
+
|
213
|
+
element_start = '<' ident_char;
|
214
|
+
element_end = '</' identifier (':' identifier)* '>';
|
215
|
+
|
216
|
+
action start_element {
|
217
|
+
callback_simple("on_element_start");
|
218
|
+
fhold;
|
219
|
+
fnext element_name;
|
220
|
+
}
|
221
|
+
|
222
|
+
action close_element {
|
223
|
+
callback_simple("on_element_end");
|
224
|
+
}
|
225
|
+
|
226
|
+
# Machine used for lexing the name/namespace of an element.
|
227
|
+
element_name := |*
|
228
|
+
identifier ':' => {
|
229
|
+
callback("on_element_ns", data, encoding, ts, te - 1);
|
230
|
+
};
|
231
|
+
|
232
|
+
identifier => {
|
233
|
+
callback("on_element_name", data, encoding, ts, te);
|
234
|
+
fnext element_head;
|
235
|
+
};
|
236
|
+
*|;
|
237
|
+
|
238
|
+
# Machine used for processing the contents of an element's starting tag.
|
239
|
+
# This covers attribute names, attribute namespaces and attribute values; the element's own name/namespace is lexed by `element_name`.
|
240
|
+
element_head := |*
|
241
|
+
whitespace | '=';
|
242
|
+
|
243
|
+
newline => {
|
244
|
+
callback_simple("advance_line");
|
245
|
+
};
|
246
|
+
|
247
|
+
# Attribute names and namespaces.
|
248
|
+
identifier ':' => {
|
249
|
+
callback("on_attribute_ns", data, encoding, ts, te - 1);
|
250
|
+
};
|
251
|
+
|
252
|
+
identifier => {
|
253
|
+
callback("on_attribute", data, encoding, ts, te);
|
254
|
+
};
|
255
|
+
|
256
|
+
# Attribute values.
|
257
|
+
string => emit_string;
|
258
|
+
|
259
|
+
# We're done with the open tag of the element.
|
260
|
+
'>' => {
|
261
|
+
callback_simple("on_element_open_end");
|
262
|
+
fnext main;
|
263
|
+
};
|
264
|
+
|
265
|
+
# Self closing tags.
|
266
|
+
'/>' => {
|
267
|
+
callback_simple("on_element_end");
|
268
|
+
fnext main;
|
269
|
+
};
|
270
|
+
*|;
|
271
|
+
|
272
|
+
# Text
|
273
|
+
#
|
274
|
+
# http://www.w3.org/TR/xml/#syntax
|
275
|
+
# http://www.w3.org/TR/html-markup/syntax.html#text-syntax
|
276
|
+
#
|
277
|
+
# Text content is everything leading up to certain special tags such as "</"
|
278
|
+
# and "<?".
|
279
|
+
|
280
|
+
action start_text {
|
281
|
+
fhold;
|
282
|
+
fnext text;
|
283
|
+
}
|
284
|
+
|
285
|
+
# These characters terminate a T_TEXT sequence and instruct Ragel to jump
|
286
|
+
# back to the main machine.
|
287
|
+
#
|
288
|
+
# Note that this only works if each sequence is exactly 2 characters
|
289
|
+
# long. Because of this "<!" is used instead of "<!--".
|
290
|
+
|
291
|
+
terminate_text = '</' | '<!' | '<?' | element_start;
|
292
|
+
allowed_text = any* -- terminate_text;
|
293
|
+
|
294
|
+
text := |*
|
295
|
+
# Text followed by a special tag, such as "foo<!--"
|
296
|
+
allowed_text @{ mark = p; } terminate_text => {
|
297
|
+
callback("on_text", data, encoding, ts, mark);
|
298
|
+
|
299
|
+
p = mark - 1;
|
300
|
+
mark = 0;
|
301
|
+
|
302
|
+
fnext main;
|
303
|
+
};
|
304
|
+
|
305
|
+
# Just regular text.
|
306
|
+
allowed_text => {
|
307
|
+
callback("on_text", data, encoding, ts, te);
|
308
|
+
fnext main;
|
309
|
+
};
|
310
|
+
*|;
|
311
|
+
|
312
|
+
# The main machine aka the entry point of Ragel.
|
313
|
+
main := |*
|
314
|
+
doctype_start => start_doctype;
|
315
|
+
xml_decl_start => start_xml_decl;
|
316
|
+
comment => start_comment;
|
317
|
+
cdata => start_cdata;
|
318
|
+
proc_ins_start => start_proc_ins;
|
319
|
+
element_start => start_element;
|
320
|
+
element_end => close_element;
|
321
|
+
any => start_text;
|
322
|
+
*|;
|
323
|
+
}%%
|