oga 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +13 -0
  3. data/LICENSE +19 -0
  4. data/README.md +171 -0
  5. data/doc/DCO.md +25 -0
  6. data/doc/changelog.md +7 -0
  7. data/doc/css/common.css +76 -0
  8. data/doc/migrating_from_nokogiri.md +169 -0
  9. data/ext/c/extconf.rb +13 -0
  10. data/ext/c/lexer.c +1518 -0
  11. data/ext/c/lexer.h +8 -0
  12. data/ext/c/lexer.rl +121 -0
  13. data/ext/c/liboga.c +6 -0
  14. data/ext/c/liboga.h +11 -0
  15. data/ext/java/Liboga.java +14 -0
  16. data/ext/java/org/liboga/xml/Lexer.java +829 -0
  17. data/ext/java/org/liboga/xml/Lexer.rl +151 -0
  18. data/ext/ragel/base_lexer.rl +323 -0
  19. data/lib/oga.rb +43 -0
  20. data/lib/oga/html/parser.rb +25 -0
  21. data/lib/oga/oga.rb +27 -0
  22. data/lib/oga/version.rb +3 -0
  23. data/lib/oga/xml/attribute.rb +111 -0
  24. data/lib/oga/xml/cdata.rb +24 -0
  25. data/lib/oga/xml/character_node.rb +39 -0
  26. data/lib/oga/xml/comment.rb +24 -0
  27. data/lib/oga/xml/doctype.rb +91 -0
  28. data/lib/oga/xml/document.rb +99 -0
  29. data/lib/oga/xml/element.rb +340 -0
  30. data/lib/oga/xml/lexer.rb +399 -0
  31. data/lib/oga/xml/namespace.rb +42 -0
  32. data/lib/oga/xml/node.rb +175 -0
  33. data/lib/oga/xml/node_set.rb +313 -0
  34. data/lib/oga/xml/parser.rb +556 -0
  35. data/lib/oga/xml/processing_instruction.rb +39 -0
  36. data/lib/oga/xml/pull_parser.rb +166 -0
  37. data/lib/oga/xml/querying.rb +32 -0
  38. data/lib/oga/xml/text.rb +16 -0
  39. data/lib/oga/xml/traversal.rb +48 -0
  40. data/lib/oga/xml/xml_declaration.rb +76 -0
  41. data/lib/oga/xpath/evaluator.rb +1748 -0
  42. data/lib/oga/xpath/lexer.rb +2043 -0
  43. data/lib/oga/xpath/node.rb +10 -0
  44. data/lib/oga/xpath/parser.rb +535 -0
  45. data/oga.gemspec +45 -0
  46. metadata +221 -0
@@ -0,0 +1,151 @@
1
+ package org.liboga.xml;
2
+
3
+ %%machine java_lexer;
4
+
5
+ import java.io.IOException;
6
+
7
+ import org.jcodings.Encoding;
8
+
9
+ import org.jruby.Ruby;
10
+ import org.jruby.RubyModule;
11
+ import org.jruby.RubyClass;
12
+ import org.jruby.RubyObject;
13
+ import org.jruby.RubyString;
14
+ import org.jruby.RubyFixnum;
15
+ import org.jruby.util.ByteList;
16
+ import org.jruby.anno.JRubyClass;
17
+ import org.jruby.anno.JRubyMethod;
18
+ import org.jruby.runtime.ThreadContext;
19
+ import org.jruby.runtime.ObjectAllocator;
20
+ import org.jruby.runtime.builtin.IRubyObject;
21
+
22
+ /**
23
+ * Lexer support class for JRuby.
24
+ *
25
+ * The Lexer class contains the raw Ragel loop and calls back in to Ruby land
26
+ * whenever a Ragel action is needed, similar to the C extension setup.
27
+ *
28
+ * This class requires Ruby land to first define the `Oga::XML` namespace.
29
+ */
30
+ @JRubyClass(name="Oga::XML::Lexer", parent="Object")
31
+ public class Lexer extends RubyObject
32
+ {
33
+ /**
34
+ * The current Ruby runtime.
35
+ */
36
+ private Ruby runtime;
37
+
38
+ %% write data;
39
+
40
+ /* Used by Ragel to keep track of the current state. */
41
+ int act;
42
+ int cs;
43
+
44
+ /**
45
+ * Sets up the current class in the Ruby runtime.
46
+ */
47
+ public static void load(Ruby runtime)
48
+ {
49
+ RubyModule xml = (RubyModule) runtime.getModule("Oga")
50
+ .getConstant("XML");
51
+
52
+ RubyClass lexer = xml.defineClassUnder(
53
+ "Lexer",
54
+ runtime.getObject(),
55
+ ALLOCATOR
56
+ );
57
+
58
+ lexer.defineAnnotatedMethods(Lexer.class);
59
+ }
60
+
61
+ private static final ObjectAllocator ALLOCATOR = new ObjectAllocator()
62
+ {
63
+ public IRubyObject allocate(Ruby runtime, RubyClass klass)
64
+ {
65
+ return new org.liboga.xml.Lexer(runtime, klass);
66
+ }
67
+ };
68
+
69
+ public Lexer(Ruby runtime, RubyClass klass)
70
+ {
71
+ super(runtime, klass);
72
+
73
+ this.runtime = runtime;
74
+ }
75
+
76
+ /**
77
+ * Runs the bulk of the Ragel loop and calls back in to Ruby.
78
+ *
79
+ * This method pulls its data in from the instance variable `@data`. The
80
+ * Ruby side of the Lexer class should set this variable to a String in its
81
+ * constructor method. Encodings are passed along to make sure that token
82
+ * values share the same encoding as the input.
83
+ *
84
+ * This method always returns nil.
85
+ */
86
+ @JRubyMethod
87
+ public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
88
+ {
89
+ Encoding encoding = rb_str.getEncoding();
90
+
91
+ byte[] data = rb_str.getBytes();
92
+
93
+ int ts = 0;
94
+ int te = 0;
95
+ int p = 0;
96
+ int mark = 0;
97
+ int pe = data.length;
98
+ int eof = data.length;
99
+
100
+ %% write exec;
101
+
102
+ return context.nil;
103
+ }
104
+
105
+ /**
106
+ * Resets the internal state of the lexer.
107
+ */
108
+ @JRubyMethod
109
+ public IRubyObject reset_native(ThreadContext context)
110
+ {
111
+ this.act = 0;
112
+ this.cs = java_lexer_start;
113
+
114
+ return context.nil;
115
+ }
116
+
117
+ /**
118
+ * Calls back in to Ruby land passing the current token value along.
119
+ *
120
+ * This method calls back in to Ruby land based on the method name
121
+ * specified in `name`. The Ruby callback should take one argument. This
122
+ * argument will be a String containing the value of the current token.
123
+ */
124
+ public void callback(String name, byte[] data, Encoding enc, int ts, int te)
125
+ {
126
+ ByteList bytelist = new ByteList(data, ts, te - ts, enc, true);
127
+
128
+ RubyString value = this.runtime.newString(bytelist);
129
+
130
+ ThreadContext context = this.runtime.getCurrentContext();
131
+
132
+ this.callMethod(context, name, value);
133
+ }
134
+
135
+ /**
136
+ * Calls back in to Ruby land without passing any arguments.
137
+ */
138
+ public void callback_simple(String name)
139
+ {
140
+ ThreadContext context = this.runtime.getCurrentContext();
141
+
142
+ this.callMethod(context, name);
143
+ }
144
+ }
145
+
146
+ %%{
147
+ variable act this.act;
148
+ variable cs this.cs;
149
+
150
+ include base_lexer "base_lexer.rl";
151
+ }%%
@@ -0,0 +1,323 @@
1
+ %%machine base_lexer;
2
+
3
+ %%{
4
+ ##
5
+ # Base grammar for the XML lexer.
6
+ #
7
+ # This grammar is shared between the C and Java extensions. As a result of
8
+ # this you should **not** include language specific code in Ragel
9
+ # actions/callbacks.
10
+ #
11
+ # To call back in to Ruby you can use one of the following two functions:
12
+ #
13
+ # * callback
14
+ # * callback_simple
15
+ #
16
+ # The first function takes 5 arguments:
17
+ #
18
+ # * The name of the Ruby method to call.
19
+ # * The input data.
20
+ # * The encoding of the input data.
21
+ # * The start offset of the token value in the input data.
22
+ # * The end offset (exclusive) of the token value in the input data.
23
+ #
24
+ # The function callback_simple only takes one argument: the name of the
25
+ # method to call. This function should be used for callbacks that don't
26
+ # require any values.
27
+ #
28
+ # When you call a method in Ruby make sure that said method is defined as
29
+ # an instance method in the `Oga::XML::Lexer` class.
30
+ #
31
+ # ## Machine Transitions
32
+ #
33
+ # To transition from one machine to another always use `fnext` instead of
34
+ # `fcall` and `fret`. This removes the need for the code to keep track of a
35
+ # stack.
36
+ #
37
+
38
+ newline = '\n' | '\r\n';
39
+ whitespace = [ \t];
40
+ ident_char = [a-zA-Z0-9\-_];
41
+ identifier = ident_char+;
42
+
43
+ # Comments
44
+ #
45
+ # http://www.w3.org/TR/html-markup/syntax.html#comments
46
+ #
47
+ # Unlike the W3 specification these rules *do* allow character sequences
48
+ # such as `--` and `->`. Putting extra checks in for these sequences would
49
+ # actually make the rules/actions more complex.
50
+ #
51
+
52
+ comment_start = '<!--';
53
+ comment_end = '-->';
54
+ comment = comment_start (any* -- comment_end) comment_end;
55
+
56
+ action start_comment {
57
+ callback("on_comment", data, encoding, ts + 4, te - 3);
58
+ }
59
+
60
+ # CDATA
61
+ #
62
+ # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
63
+ #
64
+ # In HTML CDATA tags have no meaning/are not supported. Oga does
65
+ # support them but treats their contents as plain text.
66
+ #
67
+
68
+ cdata_start = '<![CDATA[';
69
+ cdata_end = ']]>';
70
+ cdata = cdata_start (any* -- cdata_end) cdata_end;
71
+
72
+ action start_cdata {
73
+ callback("on_cdata", data, encoding, ts + 9, te - 3);
74
+ }
75
+
76
+ # Processing Instructions
77
+ #
78
+ # http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
79
+ # http://en.wikipedia.org/wiki/Processing_Instruction
80
+ #
81
+ # These are tags meant to be used by parsers/libraries for custom behaviour.
82
+ # One example is the tag pair used by PHP: <?php and ?>. Note that the XML
83
+ # declaration tags (<?xml ?>) are not considered to be a processing
84
+ # instruction.
85
+ #
86
+
87
+ proc_ins_start = '<?' identifier;
88
+ proc_ins_end = '?>';
89
+
90
+ action start_proc_ins {
91
+ callback_simple("on_proc_ins_start");
92
+ callback("on_proc_ins_name", data, encoding, ts + 2, te);
93
+
94
+ mark = te;
95
+
96
+ fnext proc_ins_body;
97
+ }
98
+
99
+ proc_ins_body := |*
100
+ proc_ins_end => {
101
+ callback("on_text", data, encoding, mark, ts);
102
+ callback_simple("on_proc_ins_end");
103
+
104
+ fnext main;
105
+ };
106
+
107
+ any;
108
+ *|;
109
+
110
+ # Strings
111
+ #
112
+ # Strings in HTML can either be single or double quoted. If a string
113
+ # starts with one of these quotes it must be closed with the same type
114
+ # of quote.
115
+ #
116
+ dquote = '"';
117
+ squote = "'";
118
+
119
+ string_dquote = (dquote ^dquote* dquote);
120
+ string_squote = (squote ^squote* squote);
121
+
122
+ string = string_dquote | string_squote;
123
+
124
+ action emit_string {
125
+ callback("on_string", data, encoding, ts + 1, te - 1);
126
+ }
127
+
128
+ # DOCTYPES
129
+ #
130
+ # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
131
+ #
132
+ # These rules support the 3 flavours of doctypes:
133
+ #
134
+ # 1. Normal doctypes, as introduced in the HTML5 specification.
135
+ # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
136
+ # 3. Legacy doctypes
137
+ #
138
+ doctype_start = '<!DOCTYPE'i whitespace+;
139
+
140
+ action start_doctype {
141
+ callback_simple("on_doctype_start");
142
+ fnext doctype;
143
+ }
144
+
145
+ # Machine for processing doctypes. Doctype values such as the public
146
+ # and system IDs are treated as T_STRING tokens.
147
+ doctype := |*
148
+ 'PUBLIC' | 'SYSTEM' => {
149
+ callback("on_doctype_type", data, encoding, ts, te);
150
+ };
151
+
152
+ # Consumes everything between the [ and ]. Due to the use of :> the ]
153
+ # is not consumed by any+.
154
+ '[' any+ :> ']' => {
155
+ callback("on_doctype_inline", data, encoding, ts + 1, te - 1);
156
+ };
157
+
158
+ # Lex the public/system IDs as regular strings.
159
+ string => emit_string;
160
+
161
+ # Whitespace inside doctypes is ignored since there's no point in
162
+ # including it.
163
+ whitespace;
164
+
165
+ identifier => {
166
+ callback("on_doctype_name", data, encoding, ts, te);
167
+ };
168
+
169
+ '>' => {
170
+ callback_simple("on_doctype_end");
171
+ fnext main;
172
+ };
173
+ *|;
174
+
175
+ # XML declaration tags
176
+ #
177
+ # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
178
+ #
179
+ xml_decl_start = '<?xml';
180
+ xml_decl_end = '?>';
181
+
182
+ action start_xml_decl {
183
+ callback_simple("on_xml_decl_start");
184
+ fnext xml_decl;
185
+ }
186
+
187
+ # Machine that processes the contents of an XML declaration tag.
188
+ xml_decl := |*
189
+ xml_decl_end => {
190
+ callback_simple("on_xml_decl_end");
191
+ fnext main;
192
+ };
193
+
194
+ # Attributes and their values (e.g. version="1.0").
195
+ identifier => {
196
+ callback("on_attribute", data, encoding, ts, te);
197
+ };
198
+
199
+ string => emit_string;
200
+
201
+ any;
202
+ *|;
203
+
204
+ # Elements
205
+ #
206
+ # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
207
+ #
208
+ # Lexing of elements is broken up into different machines that handle the
209
+ # name/namespace, contents of the open tag and the body of an element. The
210
+ # body of an element is lexed using the `main` machine.
211
+ #
212
+
213
+ element_start = '<' ident_char;
214
+ element_end = '</' identifier (':' identifier)* '>';
215
+
216
+ action start_element {
217
+ callback_simple("on_element_start");
218
+ fhold;
219
+ fnext element_name;
220
+ }
221
+
222
+ action close_element {
223
+ callback_simple("on_element_end");
224
+ }
225
+
226
+ # Machine used for lexing the name/namespace of an element.
227
+ element_name := |*
228
+ identifier ':' => {
229
+ callback("on_element_ns", data, encoding, ts, te - 1);
230
+ };
231
+
232
+ identifier => {
233
+ callback("on_element_name", data, encoding, ts, te);
234
+ fnext element_head;
235
+ };
236
+ *|;
237
+
238
+ # Machine used for processing the contents of an element's starting tag
239
+ # after its name: attribute names, namespaces, values and the closing > or />.
240
+ element_head := |*
241
+ whitespace | '=';
242
+
243
+ newline => {
244
+ callback_simple("advance_line");
245
+ };
246
+
247
+ # Attribute names and namespaces.
248
+ identifier ':' => {
249
+ callback("on_attribute_ns", data, encoding, ts, te - 1);
250
+ };
251
+
252
+ identifier => {
253
+ callback("on_attribute", data, encoding, ts, te);
254
+ };
255
+
256
+ # Attribute values.
257
+ string => emit_string;
258
+
259
+ # We're done with the open tag of the element.
260
+ '>' => {
261
+ callback_simple("on_element_open_end");
262
+ fnext main;
263
+ };
264
+
265
+ # Self closing tags.
266
+ '/>' => {
267
+ callback_simple("on_element_end");
268
+ fnext main;
269
+ };
270
+ *|;
271
+
272
+ # Text
273
+ #
274
+ # http://www.w3.org/TR/xml/#syntax
275
+ # http://www.w3.org/TR/html-markup/syntax.html#text-syntax
276
+ #
277
+ # Text content is everything leading up to certain special tags such as "</"
278
+ # and "<?".
279
+
280
+ action start_text {
281
+ fhold;
282
+ fnext text;
283
+ }
284
+
285
+ # These characters terminate a T_TEXT sequence and instruct Ragel to jump
286
+ # back to the main machine.
287
+ #
288
+ # Note that this only works if each sequence is exactly 2 characters
289
+ # long. Because of this "<!" is used instead of "<!--".
290
+
291
+ terminate_text = '</' | '<!' | '<?' | element_start;
292
+ allowed_text = any* -- terminate_text;
293
+
294
+ text := |*
295
+ # Text followed by a special tag, such as "foo<!--"
296
+ allowed_text @{ mark = p; } terminate_text => {
297
+ callback("on_text", data, encoding, ts, mark);
298
+
299
+ p = mark - 1;
300
+ mark = 0;
301
+
302
+ fnext main;
303
+ };
304
+
305
+ # Just regular text.
306
+ allowed_text => {
307
+ callback("on_text", data, encoding, ts, te);
308
+ fnext main;
309
+ };
310
+ *|;
311
+
312
+ # The main machine aka the entry point of Ragel.
313
+ main := |*
314
+ doctype_start => start_doctype;
315
+ xml_decl_start => start_xml_decl;
316
+ comment => start_comment;
317
+ cdata => start_cdata;
318
+ proc_ins_start => start_proc_ins;
319
+ element_start => start_element;
320
+ element_end => close_element;
321
+ any => start_text;
322
+ *|;
323
+ }%%