rubyjedi-oga 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +13 -0
  3. data/LICENSE +362 -0
  4. data/README.md +317 -0
  5. data/doc/css/common.css +77 -0
  6. data/doc/css_selectors.md +935 -0
  7. data/doc/manually_creating_documents.md +67 -0
  8. data/doc/migrating_from_nokogiri.md +169 -0
  9. data/doc/xml_namespaces.md +63 -0
  10. data/ext/c/extconf.rb +11 -0
  11. data/ext/c/lexer.c +2595 -0
  12. data/ext/c/lexer.h +16 -0
  13. data/ext/c/lexer.rl +198 -0
  14. data/ext/c/liboga.c +6 -0
  15. data/ext/c/liboga.h +11 -0
  16. data/ext/java/Liboga.java +14 -0
  17. data/ext/java/org/liboga/xml/Lexer.java +1363 -0
  18. data/ext/java/org/liboga/xml/Lexer.rl +223 -0
  19. data/ext/ragel/base_lexer.rl +633 -0
  20. data/lib/oga.rb +57 -0
  21. data/lib/oga/blacklist.rb +40 -0
  22. data/lib/oga/css/lexer.rb +743 -0
  23. data/lib/oga/css/parser.rb +976 -0
  24. data/lib/oga/entity_decoder.rb +21 -0
  25. data/lib/oga/html/entities.rb +2150 -0
  26. data/lib/oga/html/parser.rb +25 -0
  27. data/lib/oga/html/sax_parser.rb +18 -0
  28. data/lib/oga/lru.rb +160 -0
  29. data/lib/oga/oga.rb +57 -0
  30. data/lib/oga/version.rb +3 -0
  31. data/lib/oga/whitelist.rb +20 -0
  32. data/lib/oga/xml/attribute.rb +136 -0
  33. data/lib/oga/xml/cdata.rb +17 -0
  34. data/lib/oga/xml/character_node.rb +37 -0
  35. data/lib/oga/xml/comment.rb +17 -0
  36. data/lib/oga/xml/default_namespace.rb +13 -0
  37. data/lib/oga/xml/doctype.rb +82 -0
  38. data/lib/oga/xml/document.rb +108 -0
  39. data/lib/oga/xml/element.rb +428 -0
  40. data/lib/oga/xml/entities.rb +122 -0
  41. data/lib/oga/xml/html_void_elements.rb +15 -0
  42. data/lib/oga/xml/lexer.rb +550 -0
  43. data/lib/oga/xml/namespace.rb +48 -0
  44. data/lib/oga/xml/node.rb +219 -0
  45. data/lib/oga/xml/node_set.rb +333 -0
  46. data/lib/oga/xml/parser.rb +631 -0
  47. data/lib/oga/xml/processing_instruction.rb +37 -0
  48. data/lib/oga/xml/pull_parser.rb +175 -0
  49. data/lib/oga/xml/querying.rb +56 -0
  50. data/lib/oga/xml/sax_parser.rb +192 -0
  51. data/lib/oga/xml/text.rb +66 -0
  52. data/lib/oga/xml/traversal.rb +50 -0
  53. data/lib/oga/xml/xml_declaration.rb +65 -0
  54. data/lib/oga/xpath/evaluator.rb +1798 -0
  55. data/lib/oga/xpath/lexer.rb +1958 -0
  56. data/lib/oga/xpath/parser.rb +622 -0
  57. data/oga.gemspec +45 -0
  58. metadata +227 -0
@@ -0,0 +1,223 @@
1
+ package org.liboga.xml;
2
+
3
+ %%machine java_lexer;
4
+
5
+ import java.io.IOException;
6
+
7
+ import org.jcodings.Encoding;
8
+
9
+ import org.jruby.Ruby;
10
+ import org.jruby.RubyModule;
11
+ import org.jruby.RubyClass;
12
+ import org.jruby.RubyObject;
13
+ import org.jruby.RubyString;
14
+ import org.jruby.RubyFixnum;
15
+ import org.jruby.util.ByteList;
16
+ import org.jruby.anno.JRubyClass;
17
+ import org.jruby.anno.JRubyMethod;
18
+ import org.jruby.runtime.ThreadContext;
19
+ import org.jruby.runtime.ObjectAllocator;
20
+ import org.jruby.runtime.builtin.IRubyObject;
21
+
22
+ /**
23
+ * Lexer support class for JRuby.
24
+ *
25
+ * The Lexer class contains the raw Ragel loop and calls back in to Ruby land
26
+ * whenever a Ragel action is needed similar to the C extension setup.
27
+ *
28
+ * This class requires Ruby land to first define the `Oga::XML` namespace.
29
+ */
30
+ @JRubyClass(name="Oga::XML::Lexer", parent="Object")
31
+ public class Lexer extends RubyObject
32
+ {
33
+ /**
34
+ * The current Ruby runtime.
35
+ */
36
+ private Ruby runtime;
37
+
38
+ %% write data;
39
+
40
+ /* Ragel state (act, cs, top, stack) plus `lines`, the newline count carried over between advance_native calls. */
41
+ int act;
42
+ int cs;
43
+ int top;
44
+ int lines;
45
+ int[] stack;
46
+
47
+ /**
48
+ * Registers this class with the given Ruby runtime as `Oga::XML::Lexer`.
49
+ *
50
+ * The `XML` constant is looked up on the existing `Oga` module, the class
51
+ * is defined under it using the runtime's Object class as its parent, and
52
+ * the annotated methods of this class are then defined on it.
53
+ */
54
+ public static void load(Ruby runtime)
55
+ {
56
+ RubyModule oga = runtime.getModule("Oga");
57
+ RubyModule xml = (RubyModule) oga.getConstant("XML");
58
+
59
+ RubyClass lexer = xml.defineClassUnder("Lexer", runtime.getObject(), ALLOCATOR);
60
+
61
+ lexer.defineAnnotatedMethods(Lexer.class);
62
+ }
63
+
64
+ private static final ObjectAllocator ALLOCATOR = new ObjectAllocator()
65
+ {
66
+ public IRubyObject allocate(Ruby runtime, RubyClass klass)
67
+ {
68
+ return new org.liboga.xml.Lexer(runtime, klass);
69
+ }
70
+ };
71
+
72
+ public Lexer(Ruby runtime, RubyClass klass)
73
+ {
74
+ // Let RubyObject set up the object for the given runtime and class.
75
+ super(runtime, klass);
76
+ this.runtime = runtime; // kept so the callback helpers can fetch the current ThreadContext
77
+ }
78
+
79
+ /**
80
+ * Runs the bulk of the Ragel loop and calls back in to Ruby.
81
+ *
82
+ * The input is the Ruby String passed in as `rb_str`; its bytes are lexed
83
+ * using the Ragel state stored on this instance, and the pending line count
84
+ * is saved back afterwards. Encodings are passed along to make sure that token
85
+ * values share the same encoding as the input.
86
+ *
87
+ * This method always returns nil.
88
+ */
89
+ @JRubyMethod
90
+ public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
91
+ {
92
+ Boolean html_p = this.callMethod(context, "html?").isTrue();
93
+
94
+ Encoding encoding = rb_str.getEncoding();
95
+
96
+ byte[] data = rb_str.getBytes();
97
+
98
+ int ts = 0;
99
+ int te = 0;
100
+ int p = 0;
101
+ int mark = 0;
102
+ int lines = this.lines;
103
+ int pe = data.length;
104
+ int eof = data.length;
105
+
106
+ String id_advance_line = "advance_line";
107
+ String id_on_attribute = "on_attribute";
108
+ String id_on_attribute_ns = "on_attribute_ns";
109
+ String id_on_cdata_start = "on_cdata_start";
110
+ String id_on_cdata_body = "on_cdata_body";
111
+ String id_on_cdata_end = "on_cdata_end";
112
+ String id_on_comment_start = "on_comment_start";
113
+ String id_on_comment_body = "on_comment_body";
114
+ String id_on_comment_end = "on_comment_end";
115
+ String id_on_doctype_end = "on_doctype_end";
116
+ String id_on_doctype_inline = "on_doctype_inline";
117
+ String id_on_doctype_name = "on_doctype_name";
118
+ String id_on_doctype_start = "on_doctype_start";
119
+ String id_on_doctype_type = "on_doctype_type";
120
+ String id_on_element_end = "on_element_end";
121
+ String id_on_element_name = "on_element_name";
122
+ String id_on_element_ns = "on_element_ns";
123
+ String id_on_element_open_end = "on_element_open_end";
124
+ String id_on_proc_ins_end = "on_proc_ins_end";
125
+ String id_on_proc_ins_name = "on_proc_ins_name";
126
+ String id_on_proc_ins_start = "on_proc_ins_start";
127
+ String id_on_proc_ins_body = "on_proc_ins_body";
128
+ String id_on_string_body = "on_string_body";
129
+ String id_on_string_dquote = "on_string_dquote";
130
+ String id_on_string_squote = "on_string_squote";
131
+ String id_on_text = "on_text";
132
+ String id_on_xml_decl_end = "on_xml_decl_end";
133
+ String id_on_xml_decl_start = "on_xml_decl_start";
134
+
135
+ %% write exec;
136
+
137
+ this.lines = lines;
138
+
139
+ return context.nil;
140
+ }
141
+
142
+ /**
143
+ * Resets the internal state of the lexer: the Ragel state (act, cs, top, stack) and the pending line count. Always returns nil.
144
+ */
145
+ @JRubyMethod
146
+ public IRubyObject reset_native(ThreadContext context)
147
+ {
148
+ this.act = 0;
149
+ this.top = 0;
150
+ this.lines = 0; // drop newlines counted but not yet reported, so they do not leak into the next run
151
+ this.stack = new int[4];
152
+ this.cs = java_lexer_start;
153
+ return context.nil;
154
+ }
155
+
156
+ /**
157
+ * Calls back in to Ruby land passing the current token value along.
158
+ *
159
+ * This method calls back in to Ruby land based on the method name
160
+ * specified in `name`. The Ruby callback should take one argument. This
161
+ * argument will be a String containing the value of the current token.
162
+ */
163
+ public void callback(String name, byte[] data, Encoding enc, int ts, int te)
164
+ {
165
+ ByteList bytelist = new ByteList(data, ts, te - ts, enc, true);
166
+
167
+ RubyString value = this.runtime.newString(bytelist);
168
+
169
+ ThreadContext context = this.runtime.getCurrentContext();
170
+
171
+ this.callMethod(context, name, value);
172
+ }
173
+
174
+ /**
175
+ * Invokes the Ruby method `name` on this lexer without any arguments,
176
+ * using the current thread context of the stored runtime.
177
+ */
178
+ public void callback_simple(String name)
179
+ {
180
+ // No token value is involved, so only the method name is passed along.
181
+ this.callMethod(this.runtime.getCurrentContext(), name);
182
+ }
183
+
184
+ /**
185
+ * Reports `amount` additional lines to the Ruby side by calling the
186
+ * `advance_line` method of this lexer with that number as a Fixnum.
187
+ */
188
+ public void advance_line(int amount)
189
+ {
190
+ RubyFixnum count = this.runtime.newFixnum(amount);
191
+
192
+ this.callMethod(this.runtime.getCurrentContext(), "advance_line", count);
193
+ }
194
+
195
+ /**
196
+ * Returns the truthiness of the Ruby method `html_script?` called on this lexer.
197
+ * @see Oga::XML::Lexer#html_script?
198
+ */
199
+ public Boolean html_script_p()
200
+ {
201
+ IRubyObject answer = this.callMethod(this.runtime.getCurrentContext(), "html_script?");
202
+ return answer.isTrue();
203
+ }
204
+
205
+ /**
206
+ * Returns the truthiness of the Ruby method `html_style?` called on this lexer.
207
+ * @see Oga::XML::Lexer#html_style?
208
+ */
209
+ public Boolean html_style_p()
210
+ {
211
+ IRubyObject answer = this.callMethod(this.runtime.getCurrentContext(), "html_style?");
212
+ return answer.isTrue();
213
+ }
214
+ }
215
+
216
+ %%{
217
+ variable act this.act;
218
+ variable cs this.cs;
219
+ variable stack this.stack;
220
+ variable top this.top;
221
+
222
+ include base_lexer "base_lexer.rl";
223
+ }%%
@@ -0,0 +1,633 @@
1
+ %%machine base_lexer;
2
+
3
+ %%{
4
+ ##
5
+ # Base grammar for the XML lexer.
6
+ #
7
+ # This grammar is shared between the C and Java extensions. As a result of
8
+ # this you should **not** include language specific code in Ragel
9
+ # actions/callbacks.
10
+ #
11
+ # To call back in to Ruby you can use one of the following two functions:
12
+ #
13
+ # * callback
14
+ # * callback_simple
15
+ #
16
+ # The first function takes 5 arguments:
17
+ #
18
+ # * The name of the Ruby method to call.
19
+ # * The input data.
20
+ # * The encoding of the input data.
21
+ # * The start of the current buffer.
22
+ # * The end of the current buffer.
23
+ #
24
+ # The function callback_simple only takes one argument: the name of the
25
+ # method to call. This function should be used for callbacks that don't
26
+ # require any values.
27
+ #
28
+ # When you call a method in Ruby make sure that said method is defined as
29
+ # an instance method in the `Oga::XML::Lexer` class.
30
+ #
31
+ # The name of the callback to invoke should be an identifier starting with
32
+ # "id_". The identifier should be defined in the associated C and Java code.
33
+ # In C code its value should be a Symbol stored as an ID object; for Java
34
+ # it should be a String. For example:
35
+ #
36
+ # ID id_foo = rb_intern("foo");
37
+ #
38
+ # And for Java:
39
+ #
40
+ # String id_foo = "foo";
41
+ #
42
+ # ## Machine Transitions
43
+ #
44
+ # To transition from one machine to another prefer `fnext`. Only use `fcall` and
45
+ # `fret` when a machine must return to its caller (as done for strings and
46
+ # attribute values), since these require the code to keep track of a stack.
47
+ #
48
+
49
+ newline = '\r\n' | '\n' | '\r';
50
+ whitespace = [ \t];
51
+ ident_char = [a-zA-Z0-9\-_\.];
52
+ identifier = ident_char+;
53
+
54
+ whitespace_or_newline = whitespace | newline;
55
+
56
+ action count_newlines {
57
+ if ( fc == '\n' ) lines++;
58
+ }
59
+
60
+ action advance_newline {
61
+ advance_line(1);
62
+ }
63
+
64
+ action hold_and_return {
65
+ fhold;
66
+ fret;
67
+ }
68
+
69
+ # Comments
70
+ #
71
+ # http://www.w3.org/TR/html/syntax.html#comments
72
+ #
73
+ # Unlike the W3C specification these rules *do* allow character sequences
74
+ # such as `--` and `->`. Putting extra checks in for these sequences would
75
+ # actually make the rules/actions more complex.
76
+ #
77
+
78
+ comment_start = '<!--';
79
+ comment_end = '-->';
80
+
81
+ # Everything except "-" OR a single "-"
82
+ comment_allowed = (^'-'+ | '-') $count_newlines;
83
+
84
+ action start_comment {
85
+ callback_simple(id_on_comment_start);
86
+
87
+ fnext comment_body;
88
+ }
89
+
90
+ comment_body := |*
91
+ comment_allowed => {
92
+ callback(id_on_comment_body, data, encoding, ts, te);
93
+
94
+ if ( lines > 0 )
95
+ {
96
+ advance_line(lines);
97
+
98
+ lines = 0;
99
+ }
100
+ };
101
+
102
+ comment_end => {
103
+ callback_simple(id_on_comment_end);
104
+
105
+ fnext main;
106
+ };
107
+ *|;
108
+
109
+ # CDATA
110
+ #
111
+ # http://www.w3.org/TR/html/syntax.html#cdata-sections
112
+ #
113
+ # In HTML CDATA tags have no meaning/are not supported. Oga does
114
+ # support them but treats their contents as plain text.
115
+ #
116
+
117
+ cdata_start = '<![CDATA[';
118
+ cdata_end = ']]>';
119
+
120
+ # Everything except "]" OR a single "]"
121
+ cdata_allowed = (^']'+ | ']') $count_newlines;
122
+
123
+ action start_cdata {
124
+ callback_simple(id_on_cdata_start);
125
+
126
+ fnext cdata_body;
127
+ }
128
+
129
+ cdata_body := |*
130
+ cdata_allowed => {
131
+ callback(id_on_cdata_body, data, encoding, ts, te);
132
+
133
+ if ( lines > 0 )
134
+ {
135
+ advance_line(lines);
136
+
137
+ lines = 0;
138
+ }
139
+ };
140
+
141
+ cdata_end => {
142
+ callback_simple(id_on_cdata_end);
143
+
144
+ fnext main;
145
+ };
146
+ *|;
147
+
148
+ # Processing Instructions
149
+ #
150
+ # http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
151
+ # http://en.wikipedia.org/wiki/Processing_Instruction
152
+ #
153
+ # These are tags meant to be used by parsers/libraries for custom behaviour.
154
+ # One example is the pair of tags used by PHP: <?php and ?>. Note that the XML
155
+ # declaration tags (<?xml ?>) are not considered to be a processing
156
+ # instruction.
157
+ #
158
+
159
+ proc_ins_start = '<?' identifier;
160
+ proc_ins_end = '?>';
161
+
162
+ # Everything except "?" OR a single "?"
163
+ proc_ins_allowed = (^'?'+ | '?') $count_newlines;
164
+
165
+ action start_proc_ins {
166
+ callback_simple(id_on_proc_ins_start);
167
+ callback(id_on_proc_ins_name, data, encoding, ts + 2, te);
168
+
169
+ fnext proc_ins_body;
170
+ }
171
+
172
+ proc_ins_body := |*
173
+ proc_ins_allowed => {
174
+ callback(id_on_proc_ins_body, data, encoding, ts, te);
175
+
176
+ if ( lines > 0 )
177
+ {
178
+ advance_line(lines);
179
+
180
+ lines = 0;
181
+ }
182
+ };
183
+
184
+ proc_ins_end => {
185
+ callback_simple(id_on_proc_ins_end);
186
+
187
+ fnext main;
188
+ };
189
+ *|;
190
+
191
+ # Strings
192
+ #
193
+ # Strings in HTML can either be single or double quoted. If a string
194
+ # starts with one of these quotes it must be closed with the same type
195
+ # of quote.
196
+ #
197
+ dquote = '"';
198
+ squote = "'";
199
+
200
+ action emit_string {
201
+ callback(id_on_string_body, data, encoding, ts, te);
202
+
203
+ if ( lines > 0 )
204
+ {
205
+ advance_line(lines);
206
+
207
+ lines = 0;
208
+ }
209
+ }
210
+
211
+ action start_string_squote {
212
+ callback_simple(id_on_string_squote);
213
+
214
+ fcall string_squote;
215
+ }
216
+
217
+ action start_string_dquote {
218
+ callback_simple(id_on_string_dquote);
219
+
220
+ fcall string_dquote;
221
+ }
222
+
223
+ string_squote := |*
224
+ ^squote* $count_newlines => emit_string;
225
+
226
+ squote => {
227
+ callback_simple(id_on_string_squote);
228
+
229
+ fret;
230
+ };
231
+ *|;
232
+
233
+ string_dquote := |*
234
+ ^dquote* $count_newlines => emit_string;
235
+
236
+ dquote => {
237
+ callback_simple(id_on_string_dquote);
238
+
239
+ fret;
240
+ };
241
+ *|;
242
+
243
+ # DOCTYPES
244
+ #
245
+ # http://www.w3.org/TR/html/syntax.html#the-doctype
246
+ #
247
+ # These rules support the 3 flavours of doctypes:
248
+ #
249
+ # 1. Normal doctypes, as introduced in the HTML5 specification.
250
+ # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
251
+ # 3. Legacy doctypes
252
+ #
253
+ doctype_start = '<!DOCTYPE'i (whitespace_or_newline+ $count_newlines);
254
+
255
+ action start_doctype {
256
+ callback_simple(id_on_doctype_start);
257
+
258
+ if ( lines > 0 )
259
+ {
260
+ advance_line(lines);
261
+
262
+ lines = 0;
263
+ }
264
+
265
+ fnext doctype;
266
+ }
267
+
268
+ # Machine for processing inline rules of a doctype.
269
+ doctype_inline := |*
270
+ ^']'* $count_newlines => {
271
+ callback(id_on_doctype_inline, data, encoding, ts, te);
272
+
273
+ if ( lines > 0 )
274
+ {
275
+ advance_line(lines);
276
+
277
+ lines = 0;
278
+ }
279
+ };
280
+
281
+ ']' => { fnext doctype; };
282
+ *|;
283
+
284
+ # Machine for processing doctypes. Doctype values such as the public
285
+ # and system IDs are treated as T_STRING tokens.
286
+ doctype := |*
287
+ 'PUBLIC' | 'SYSTEM' => {
288
+ callback(id_on_doctype_type, data, encoding, ts, te);
289
+ };
290
+
291
+ # Starts a set of inline doctype rules.
292
+ '[' => { fnext doctype_inline; };
293
+
294
+ # Lex the public/system IDs as regular strings.
295
+ squote => start_string_squote;
296
+ dquote => start_string_dquote;
297
+
298
+ identifier => {
299
+ callback(id_on_doctype_name, data, encoding, ts, te);
300
+ };
301
+
302
+ '>' => {
303
+ callback_simple(id_on_doctype_end);
304
+ fnext main;
305
+ };
306
+
307
+ newline => advance_newline;
308
+
309
+ whitespace;
310
+ *|;
311
+
312
+ # XML declaration tags
313
+ #
314
+ # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
315
+ #
316
+ xml_decl_start = '<?xml';
317
+ xml_decl_end = '?>';
318
+
319
+ action start_xml_decl {
320
+ callback_simple(id_on_xml_decl_start);
321
+ fnext xml_decl;
322
+ }
323
+
324
+ # Machine that processes the contents of an XML declaration tag.
325
+ xml_decl := |*
326
+ xml_decl_end => {
327
+ if ( lines > 0 )
328
+ {
329
+ advance_line(lines);
330
+
331
+ lines = 0;
332
+ }
333
+
334
+ callback_simple(id_on_xml_decl_end);
335
+
336
+ fnext main;
337
+ };
338
+
339
+ # Attributes and their values (e.g. version="1.0").
340
+ identifier => {
341
+ if ( lines > 0 )
342
+ {
343
+ advance_line(lines);
344
+
345
+ lines = 0;
346
+ }
347
+
348
+ callback(id_on_attribute, data, encoding, ts, te);
349
+ };
350
+
351
+ squote => start_string_squote;
352
+ dquote => start_string_dquote;
353
+
354
+ any $count_newlines;
355
+ *|;
356
+
357
+ # Elements
358
+ #
359
+ # http://www.w3.org/TR/html/syntax.html#syntax-elements
360
+ #
361
+ # Lexing of elements is broken up into different machines that handle the
362
+ # name/namespace, contents of the open tag and the body of an element. The
363
+ # body of an element is lexed using the `main` machine.
364
+ #
365
+
366
+ action start_element {
367
+ fhold;
368
+ fnext element_name;
369
+ }
370
+
371
+ action start_close_element {
372
+ fnext element_close;
373
+ }
374
+
375
+ action close_element {
376
+ callback(id_on_element_end, data, encoding, ts, te);
377
+ }
378
+
379
+ action close_element_fnext_main {
380
+ callback_simple(id_on_element_end);
381
+
382
+ fnext main;
383
+ }
384
+
385
+ element_start = '<' ident_char;
386
+ element_end = '</';
387
+
388
+ # Machine used for lexing the name/namespace of an element.
389
+ element_name := |*
390
+ identifier ':' => {
391
+ callback(id_on_element_ns, data, encoding, ts, te - 1);
392
+ };
393
+
394
+ identifier => {
395
+ callback(id_on_element_name, data, encoding, ts, te);
396
+ fnext element_head;
397
+ };
398
+ *|;
399
+
400
+ # Machine used for lexing the closing tag of an element
401
+ element_close := |*
402
+ # namespace prefixes, currently not used but allows the rule below it
403
+ # to be used for the actual element name.
404
+ identifier ':';
405
+
406
+ identifier => close_element;
407
+
408
+ '>' => {
409
+ if ( lines > 0 )
410
+ {
411
+ advance_line(lines);
412
+
413
+ lines = 0;
414
+ }
415
+
416
+ fnext main;
417
+ };
418
+
419
+ any $count_newlines;
420
+ *|;
421
+
422
+ # Characters that can be used for unquoted HTML attribute values.
423
+ # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
424
+ # for more info.
425
+ html_unquoted_value =
426
+ ^(squote | dquote | whitespace_or_newline)
427
+ ^('`' | '=' | '<' | '>' | whitespace_or_newline)+;
428
+
429
+ # Machine used after matching the "=" of an attribute and just before moving
430
+ # into the actual attribute value.
431
+ attribute_pre := |*
432
+ whitespace_or_newline $count_newlines;
433
+
434
+ any => {
435
+ fhold;
436
+
437
+ if ( lines > 0 )
438
+ {
439
+ advance_line(lines);
440
+
441
+ lines = 0;
442
+ }
443
+
444
+ if ( html_p )
445
+ {
446
+ fnext html_attribute_value;
447
+ }
448
+ else
449
+ {
450
+ fnext xml_attribute_value;
451
+ }
452
+ };
453
+ *|;
454
+
455
+ # Machine used for processing HTML attribute values.
456
+ html_attribute_value := |*
457
+ squote | dquote => {
458
+ fhold;
459
+ fnext xml_attribute_value;
460
+ };
461
+
462
+ # Unquoted attribute values are lexed as if they were single quoted
463
+ # strings.
464
+ html_unquoted_value => {
465
+ callback_simple(id_on_string_squote);
466
+
467
+ callback(id_on_string_body, data, encoding, ts, te);
468
+
469
+ callback_simple(id_on_string_squote);
470
+ };
471
+
472
+ any => hold_and_return;
473
+ *|;
474
+
475
+ # Machine used for processing XML attribute values.
476
+ xml_attribute_value := |*
477
+ # The following two actions use "fnext" instead of "fcall". This machine is
478
+ # reached through the "fcall" in "element_head" (by way of "attribute_pre"), so
479
+ # the "fret" of the string machines returns to "element_head" after a single string.
480
+ squote => {
481
+ callback_simple(id_on_string_squote);
482
+
483
+ fnext string_squote;
484
+ };
485
+
486
+ dquote => {
487
+ callback_simple(id_on_string_dquote);
488
+
489
+ fnext string_dquote;
490
+ };
491
+
492
+ any => hold_and_return;
493
+ *|;
494
+
495
+ # Machine used for processing the contents of an element's starting tag.
496
+ # This includes the name, namespace and attributes.
497
+ element_head := |*
498
+ newline => advance_newline;
499
+
500
+ # Attribute names and namespaces.
501
+ identifier ':' => {
502
+ callback(id_on_attribute_ns, data, encoding, ts, te - 1);
503
+ };
504
+
505
+ identifier => {
506
+ callback(id_on_attribute, data, encoding, ts, te);
507
+ };
508
+
509
+ # Attribute values.
510
+ '=' => {
511
+ fcall attribute_pre;
512
+ };
513
+
514
+ # We're done with the open tag of the element.
515
+ '>' => {
516
+ callback_simple(id_on_element_open_end);
517
+
518
+ if ( html_script_p() )
519
+ {
520
+ fnext html_script;
521
+ }
522
+ else if ( html_style_p() )
523
+ {
524
+ fnext html_style;
525
+ }
526
+ else
527
+ {
528
+ fnext main;
529
+ }
530
+ };
531
+
532
+ # Self closing tags.
533
+ '/>' => {
534
+ callback_simple(id_on_element_end);
535
+ fnext main;
536
+ };
537
+
538
+ any;
539
+ *|;
540
+
541
+ # Text
542
+ #
543
+ # http://www.w3.org/TR/xml/#syntax
544
+ # http://www.w3.org/TR/html/syntax.html#text
545
+ #
546
+ # Text content is everything leading up to certain special tags such as "</"
547
+ # and "<?".
548
+
549
+ action start_text {
550
+ fhold;
551
+ fnext text;
552
+ }
553
+
554
+ # These characters terminate a T_TEXT sequence and instruct Ragel to jump
555
+ # back to the main machine.
556
+ #
557
+ # Note that this only works if each sequence is exactly 2 characters
558
+ # long. Because of this "<!" is used instead of "<!--".
559
+
560
+ terminate_text = '</' | '<!' | '<?' | element_start;
561
+ allowed_text = (any* -- terminate_text) $count_newlines;
562
+
563
+ action emit_text {
564
+ callback(id_on_text, data, encoding, ts, te);
565
+
566
+ if ( lines > 0 )
567
+ {
568
+ advance_line(lines);
569
+
570
+ lines = 0;
571
+ }
572
+ }
573
+
574
+ text := |*
575
+ terminate_text | allowed_text => {
576
+ callback(id_on_text, data, encoding, ts, te);
577
+
578
+ if ( lines > 0 )
579
+ {
580
+ advance_line(lines);
581
+
582
+ lines = 0;
583
+ }
584
+
585
+ fnext main;
586
+ };
587
+
588
+ # Text followed by a special tag, such as "foo<!--"
589
+ allowed_text %{ mark = p; } terminate_text => {
590
+ callback(id_on_text, data, encoding, ts, mark);
591
+
592
+ p = mark - 1;
593
+ mark = 0;
594
+
595
+ if ( lines > 0 )
596
+ {
597
+ advance_line(lines);
598
+
599
+ lines = 0;
600
+ }
601
+
602
+ fnext main;
603
+ };
604
+ *|;
605
+
606
+ # Certain tags in HTML can contain basically anything except for the literal
607
+ # closing tag. Two examples are script and style tags. As a result of this
608
+ # we can't use the regular text machine.
609
+
610
+ literal_html_allowed = (^'<'+ | '<'+) $count_newlines;
611
+
612
+ html_script := |*
613
+ literal_html_allowed => emit_text;
614
+ '</script>' => close_element_fnext_main;
615
+ *|;
616
+
617
+ html_style := |*
618
+ literal_html_allowed => emit_text;
619
+ '</style>' => close_element_fnext_main;
620
+ *|;
621
+
622
+ # The main machine aka the entry point of Ragel.
623
+ main := |*
624
+ doctype_start => start_doctype;
625
+ xml_decl_start => start_xml_decl;
626
+ comment_start => start_comment;
627
+ cdata_start => start_cdata;
628
+ proc_ins_start => start_proc_ins;
629
+ element_start => start_element;
630
+ element_end => start_close_element;
631
+ any => start_text;
632
+ *|;
633
+ }%%