rubyjedi-oga 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +13 -0
  3. data/LICENSE +362 -0
  4. data/README.md +317 -0
  5. data/doc/css/common.css +77 -0
  6. data/doc/css_selectors.md +935 -0
  7. data/doc/manually_creating_documents.md +67 -0
  8. data/doc/migrating_from_nokogiri.md +169 -0
  9. data/doc/xml_namespaces.md +63 -0
  10. data/ext/c/extconf.rb +11 -0
  11. data/ext/c/lexer.c +2595 -0
  12. data/ext/c/lexer.h +16 -0
  13. data/ext/c/lexer.rl +198 -0
  14. data/ext/c/liboga.c +6 -0
  15. data/ext/c/liboga.h +11 -0
  16. data/ext/java/Liboga.java +14 -0
  17. data/ext/java/org/liboga/xml/Lexer.java +1363 -0
  18. data/ext/java/org/liboga/xml/Lexer.rl +223 -0
  19. data/ext/ragel/base_lexer.rl +633 -0
  20. data/lib/oga.rb +57 -0
  21. data/lib/oga/blacklist.rb +40 -0
  22. data/lib/oga/css/lexer.rb +743 -0
  23. data/lib/oga/css/parser.rb +976 -0
  24. data/lib/oga/entity_decoder.rb +21 -0
  25. data/lib/oga/html/entities.rb +2150 -0
  26. data/lib/oga/html/parser.rb +25 -0
  27. data/lib/oga/html/sax_parser.rb +18 -0
  28. data/lib/oga/lru.rb +160 -0
  29. data/lib/oga/oga.rb +57 -0
  30. data/lib/oga/version.rb +3 -0
  31. data/lib/oga/whitelist.rb +20 -0
  32. data/lib/oga/xml/attribute.rb +136 -0
  33. data/lib/oga/xml/cdata.rb +17 -0
  34. data/lib/oga/xml/character_node.rb +37 -0
  35. data/lib/oga/xml/comment.rb +17 -0
  36. data/lib/oga/xml/default_namespace.rb +13 -0
  37. data/lib/oga/xml/doctype.rb +82 -0
  38. data/lib/oga/xml/document.rb +108 -0
  39. data/lib/oga/xml/element.rb +428 -0
  40. data/lib/oga/xml/entities.rb +122 -0
  41. data/lib/oga/xml/html_void_elements.rb +15 -0
  42. data/lib/oga/xml/lexer.rb +550 -0
  43. data/lib/oga/xml/namespace.rb +48 -0
  44. data/lib/oga/xml/node.rb +219 -0
  45. data/lib/oga/xml/node_set.rb +333 -0
  46. data/lib/oga/xml/parser.rb +631 -0
  47. data/lib/oga/xml/processing_instruction.rb +37 -0
  48. data/lib/oga/xml/pull_parser.rb +175 -0
  49. data/lib/oga/xml/querying.rb +56 -0
  50. data/lib/oga/xml/sax_parser.rb +192 -0
  51. data/lib/oga/xml/text.rb +66 -0
  52. data/lib/oga/xml/traversal.rb +50 -0
  53. data/lib/oga/xml/xml_declaration.rb +65 -0
  54. data/lib/oga/xpath/evaluator.rb +1798 -0
  55. data/lib/oga/xpath/lexer.rb +1958 -0
  56. data/lib/oga/xpath/parser.rb +622 -0
  57. data/oga.gemspec +45 -0
  58. metadata +227 -0
@@ -0,0 +1,223 @@
1
+ package org.liboga.xml;
2
+
3
+ %%machine java_lexer;
4
+
5
+ import java.io.IOException;
6
+
7
+ import org.jcodings.Encoding;
8
+
9
+ import org.jruby.Ruby;
10
+ import org.jruby.RubyModule;
11
+ import org.jruby.RubyClass;
12
+ import org.jruby.RubyObject;
13
+ import org.jruby.RubyString;
14
+ import org.jruby.RubyFixnum;
15
+ import org.jruby.util.ByteList;
16
+ import org.jruby.anno.JRubyClass;
17
+ import org.jruby.anno.JRubyMethod;
18
+ import org.jruby.runtime.ThreadContext;
19
+ import org.jruby.runtime.ObjectAllocator;
20
+ import org.jruby.runtime.builtin.IRubyObject;
21
+
22
+ /**
23
+ * Lexer support class for JRuby.
24
+ *
25
+ * The Lexer class contains the raw Ragel loop and calls back in to Ruby land
26
+ * whenever a Ragel action is needed similar to the C extension setup.
27
+ *
28
+ * This class requires Ruby land to first define the `Oga::XML` namespace.
29
+ */
30
+ @JRubyClass(name="Oga::XML::Lexer", parent="Object")
31
+ public class Lexer extends RubyObject
32
+ {
33
+ /**
34
+ * The current Ruby runtime.
35
+ */
36
+ private Ruby runtime;
37
+
38
+ %% write data;
39
+
40
+ /* Used by Ragel to keep track of the current state. */
41
+ int act;
42
+ int cs;
43
+ int top;
44
+ int lines;
45
+ int[] stack;
46
+
47
+ /**
48
+ * Registers the `Lexer` class under the existing `Oga::XML` namespace and
49
+ * binds the annotated methods of this class to it.
50
+ */
51
+ public static void load(Ruby runtime)
52
+ {
53
+ RubyModule oga = runtime.getModule("Oga");
54
+ RubyModule xmlNamespace = (RubyModule) oga.getConstant("XML");
55
+ RubyClass objectClass = runtime.getObject();
56
+
57
+ RubyClass lexerClass = xmlNamespace.defineClassUnder(
58
+ "Lexer", objectClass, ALLOCATOR
59
+ );
60
+
61
+ lexerClass.defineAnnotatedMethods(Lexer.class);
62
+ }
63
+
64
+ private static final ObjectAllocator ALLOCATOR = new ObjectAllocator()
65
+ {
66
+ public IRubyObject allocate(Ruby runtime, RubyClass klass)
67
+ {
68
+ return new org.liboga.xml.Lexer(runtime, klass);
69
+ }
70
+ };
71
+
72
+ public Lexer(Ruby runtime, RubyClass klass)
73
+ {
74
+ super(runtime, klass);
75
+ // Keep the runtime around: the callback helpers use it to build values and fetch the context.
76
+ this.runtime = runtime;
77
+ }
78
+
79
+ /**
80
+ * Runs the bulk of the Ragel loop and calls back in to Ruby.
81
+ *
82
+ * The input is the Ruby String passed as `rb_str`: its bytes are fed to the
83
+ * Ragel loop and its encoding is passed along to the callbacks so that token
84
+ * values share the same encoding as the input. The newline counter is
85
+ * loaded from `this.lines` on entry and stored back on exit.
86
+ *
87
+ * This method always returns nil.
88
+ */
89
+ @JRubyMethod
90
+ public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
91
+ {
92
+ Boolean html_p = this.callMethod(context, "html?").isTrue();
93
+
94
+ Encoding encoding = rb_str.getEncoding();
95
+
96
+ byte[] data = rb_str.getBytes();
97
+ // Cursor variables; the shared grammar actions reference ts, te, p, mark and lines by name.
98
+ int ts = 0;
99
+ int te = 0;
100
+ int p = 0;
101
+ int mark = 0;
102
+ int lines = this.lines;
103
+ int pe = data.length;
104
+ int eof = data.length;
105
+ // Ruby method names; the shared grammar refers to them through these id_* locals.
106
+ String id_advance_line = "advance_line";
107
+ String id_on_attribute = "on_attribute";
108
+ String id_on_attribute_ns = "on_attribute_ns";
109
+ String id_on_cdata_start = "on_cdata_start";
110
+ String id_on_cdata_body = "on_cdata_body";
111
+ String id_on_cdata_end = "on_cdata_end";
112
+ String id_on_comment_start = "on_comment_start";
113
+ String id_on_comment_body = "on_comment_body";
114
+ String id_on_comment_end = "on_comment_end";
115
+ String id_on_doctype_end = "on_doctype_end";
116
+ String id_on_doctype_inline = "on_doctype_inline";
117
+ String id_on_doctype_name = "on_doctype_name";
118
+ String id_on_doctype_start = "on_doctype_start";
119
+ String id_on_doctype_type = "on_doctype_type";
120
+ String id_on_element_end = "on_element_end";
121
+ String id_on_element_name = "on_element_name";
122
+ String id_on_element_ns = "on_element_ns";
123
+ String id_on_element_open_end = "on_element_open_end";
124
+ String id_on_proc_ins_end = "on_proc_ins_end";
125
+ String id_on_proc_ins_name = "on_proc_ins_name";
126
+ String id_on_proc_ins_start = "on_proc_ins_start";
127
+ String id_on_proc_ins_body = "on_proc_ins_body";
128
+ String id_on_string_body = "on_string_body";
129
+ String id_on_string_dquote = "on_string_dquote";
130
+ String id_on_string_squote = "on_string_squote";
131
+ String id_on_text = "on_text";
132
+ String id_on_xml_decl_end = "on_xml_decl_end";
133
+ String id_on_xml_decl_start = "on_xml_decl_start";
134
+
135
+ %% write exec;
136
+ // Carry over newline counts that were not reported via advance_line yet.
137
+ this.lines = lines;
138
+
139
+ return context.nil;
140
+ }
141
+
142
+ /**
143
+ * Resets the lexer's internal state: the Ragel variables and the pending line count.
144
+ */
145
+ @JRubyMethod
146
+ public IRubyObject reset_native(ThreadContext context)
147
+ {
148
+ this.act = 0;
149
+ this.top = 0;
150
+ this.stack = new int[4];
151
+ this.cs = java_lexer_start;
152
+ this.lines = 0; // drop newline counts carried over by a previous advance_native call
153
+ return context.nil;
154
+ }
155
+
156
+ /**
157
+ * Invokes the Ruby method `name` on this lexer with the current token.
158
+ *
159
+ * The token is built from the `te - ts` bytes of `data` starting at `ts`,
160
+ * tagged with the encoding `enc`, and handed to the Ruby method as its
161
+ * single String argument.
162
+ */
163
+ public void callback(String name, byte[] data, Encoding enc, int ts, int te)
164
+ {
165
+ final int length = te - ts;
166
+ final ByteList slice = new ByteList(data, ts, length, enc, true);
167
+
168
+ final RubyString token = this.runtime.newString(slice);
169
+
170
+ // Dispatch to the Ruby-side handler selected by `name`.
171
+ this.callMethod(this.runtime.getCurrentContext(), name, token);
172
+ }
173
+
174
+ /**
175
+ * Invokes the Ruby method `name` on this lexer with no arguments.
176
+ */
177
+ public void callback_simple(String name)
178
+ {
179
+ final Ruby rt = this.runtime;
180
+ final ThreadContext current = rt.getCurrentContext();
181
+ this.callMethod(current, name);
182
+ }
183
+
184
+ /**
185
+ * Calls the Ruby method `advance_line` with `amount` wrapped as a Fixnum.
186
+ */
187
+ public void advance_line(int amount)
188
+ {
189
+ final Ruby rt = this.runtime;
190
+ final ThreadContext current = rt.getCurrentContext();
191
+ final RubyFixnum count = rt.newFixnum(amount);
192
+ this.callMethod(current, "advance_line", count);
193
+ }
194
+
195
+ /**
196
+ * Returns whether the Ruby method `html_script?` answers with a true value.
197
+ */
198
+ public Boolean html_script_p()
199
+ {
200
+ final ThreadContext current = this.runtime.getCurrentContext();
201
+ final IRubyObject answer = this.callMethod(current, "html_script?");
202
+ return answer.isTrue();
203
+ }
204
+
205
+ /**
206
+ * Returns whether the Ruby method `html_style?` answers with a true value.
207
+ */
208
+ public Boolean html_style_p()
209
+ {
210
+ final ThreadContext current = this.runtime.getCurrentContext();
211
+ final IRubyObject answer = this.callMethod(current, "html_style?");
212
+ return answer.isTrue();
213
+ }
214
+ }
215
+
216
+ %%{
217
+ variable act this.act;
218
+ variable cs this.cs;
219
+ variable stack this.stack;
220
+ variable top this.top;
221
+
222
+ include base_lexer "base_lexer.rl";
223
+ }%%
@@ -0,0 +1,633 @@
1
+ %%machine base_lexer;
2
+
3
+ %%{
4
+ ##
5
+ # Base grammar for the XML lexer.
6
+ #
7
+ # This grammar is shared between the C and Java extensions. As a result of
8
+ # this you should **not** include language specific code in Ragel
9
+ # actions/callbacks.
10
+ #
11
+ # To call back in to Ruby you can use one of the following two functions:
12
+ #
13
+ # * callback
14
+ # * callback_simple
15
+ #
16
+ # The first function takes 5 arguments:
17
+ #
18
+ # * The name of the Ruby method to call.
19
+ # * The input data.
20
+ # * The encoding of the input data.
21
+ # * The start of the current buffer.
22
+ # * The end of the current buffer.
23
+ #
24
+ # The function callback_simple only takes one argument: the name of the
25
+ # method to call. This function should be used for callbacks that don't
26
+ # require any values.
27
+ #
28
+ # When you call a method in Ruby make sure that said method is defined as
29
+ # an instance method in the `Oga::XML::Lexer` class.
30
+ #
31
+ # The name of the callback to invoke should be an identifier starting with
32
+ # "id_". The identifier should be defined in the associated C and Java code.
33
+ # In the C code its value should be a Symbol stored as an ID object; for Java
34
+ # it should be a String. For example:
35
+ #
36
+ # ID id_foo = rb_intern("foo");
37
+ #
38
+ # And for Java:
39
+ #
40
+ # String id_foo = "foo";
41
+ #
42
+ # ## Machine Transitions
43
+ #
44
+ # To transition from one machine to another prefer `fnext` over `fcall` and
45
+ # `fret`. The latter pair is only used where a machine must return to its
46
+ # caller (strings and attribute values) and relies on the host code's stack.
47
+ #
48
+
49
+ newline = '\r\n' | '\n' | '\r';
50
+ whitespace = [ \t];
51
+ ident_char = [a-zA-Z0-9\-_\.];
52
+ identifier = ident_char+;
53
+
54
+ whitespace_or_newline = whitespace | newline;
55
+
56
+ action count_newlines {
57
+ if ( fc == '\n' ) lines++;
58
+ }
59
+
60
+ action advance_newline {
61
+ advance_line(1);
62
+ }
63
+
64
+ action hold_and_return {
65
+ fhold;
66
+ fret;
67
+ }
68
+
69
+ # Comments
70
+ #
71
+ # http://www.w3.org/TR/html/syntax.html#comments
72
+ #
73
+ # Unlike the W3C specification these rules *do* allow character sequences
74
+ # such as `--` and `->`. Putting extra checks in for these sequences would
75
+ # actually make the rules/actions more complex.
76
+ #
77
+
78
+ comment_start = '<!--';
79
+ comment_end = '-->';
80
+
81
+ # Everything except "-" OR a single "-"
82
+ comment_allowed = (^'-'+ | '-') $count_newlines;
83
+
84
+ action start_comment {
85
+ callback_simple(id_on_comment_start);
86
+
87
+ fnext comment_body;
88
+ }
89
+
90
+ comment_body := |*
91
+ comment_allowed => {
92
+ callback(id_on_comment_body, data, encoding, ts, te);
93
+
94
+ if ( lines > 0 )
95
+ {
96
+ advance_line(lines);
97
+
98
+ lines = 0;
99
+ }
100
+ };
101
+
102
+ comment_end => {
103
+ callback_simple(id_on_comment_end);
104
+
105
+ fnext main;
106
+ };
107
+ *|;
108
+
109
+ # CDATA
110
+ #
111
+ # http://www.w3.org/TR/html/syntax.html#cdata-sections
112
+ #
113
+ # In HTML CDATA tags have no meaning/are not supported. Oga does
114
+ # support them but treats their contents as plain text.
115
+ #
116
+
117
+ cdata_start = '<![CDATA[';
118
+ cdata_end = ']]>';
119
+
120
+ # Everything except "]" OR a single "]"
121
+ cdata_allowed = (^']'+ | ']') $count_newlines;
122
+
123
+ action start_cdata {
124
+ callback_simple(id_on_cdata_start);
125
+
126
+ fnext cdata_body;
127
+ }
128
+
129
+ cdata_body := |*
130
+ cdata_allowed => {
131
+ callback(id_on_cdata_body, data, encoding, ts, te);
132
+
133
+ if ( lines > 0 )
134
+ {
135
+ advance_line(lines);
136
+
137
+ lines = 0;
138
+ }
139
+ };
140
+
141
+ cdata_end => {
142
+ callback_simple(id_on_cdata_end);
143
+
144
+ fnext main;
145
+ };
146
+ *|;
147
+
148
+ # Processing Instructions
149
+ #
150
+ # http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
151
+ # http://en.wikipedia.org/wiki/Processing_Instruction
152
+ #
153
+ # These are tags meant to be used by parsers/libraries for custom behaviour.
154
+ # One example is the pair of tags used by PHP: <?php and ?>. Note that the XML
155
+ # declaration tags (<?xml ?>) are not considered to be a processing
156
+ # instruction.
157
+ #
158
+
159
+ proc_ins_start = '<?' identifier;
160
+ proc_ins_end = '?>';
161
+
162
+ # Everything except "?" OR a single "?"
163
+ proc_ins_allowed = (^'?'+ | '?') $count_newlines;
164
+
165
+ action start_proc_ins {
166
+ callback_simple(id_on_proc_ins_start);
167
+ callback(id_on_proc_ins_name, data, encoding, ts + 2, te);
168
+
169
+ fnext proc_ins_body;
170
+ }
171
+
172
+ proc_ins_body := |*
173
+ proc_ins_allowed => {
174
+ callback(id_on_proc_ins_body, data, encoding, ts, te);
175
+
176
+ if ( lines > 0 )
177
+ {
178
+ advance_line(lines);
179
+
180
+ lines = 0;
181
+ }
182
+ };
183
+
184
+ proc_ins_end => {
185
+ callback_simple(id_on_proc_ins_end);
186
+
187
+ fnext main;
188
+ };
189
+ *|;
190
+
191
+ # Strings
192
+ #
193
+ # Strings in HTML can either be single or double quoted. If a string
194
+ # starts with one of these quotes it must be closed with the same type
195
+ # of quote.
196
+ #
197
+ dquote = '"';
198
+ squote = "'";
199
+
200
+ action emit_string {
201
+ callback(id_on_string_body, data, encoding, ts, te);
202
+
203
+ if ( lines > 0 )
204
+ {
205
+ advance_line(lines);
206
+
207
+ lines = 0;
208
+ }
209
+ }
210
+
211
+ action start_string_squote {
212
+ callback_simple(id_on_string_squote);
213
+
214
+ fcall string_squote;
215
+ }
216
+
217
+ action start_string_dquote {
218
+ callback_simple(id_on_string_dquote);
219
+
220
+ fcall string_dquote;
221
+ }
222
+
223
+ string_squote := |*
224
+ ^squote* $count_newlines => emit_string;
225
+
226
+ squote => {
227
+ callback_simple(id_on_string_squote);
228
+
229
+ fret;
230
+ };
231
+ *|;
232
+
233
+ string_dquote := |*
234
+ ^dquote* $count_newlines => emit_string;
235
+
236
+ dquote => {
237
+ callback_simple(id_on_string_dquote);
238
+
239
+ fret;
240
+ };
241
+ *|;
242
+
243
+ # DOCTYPES
244
+ #
245
+ # http://www.w3.org/TR/html/syntax.html#the-doctype
246
+ #
247
+ # These rules support the 3 flavours of doctypes:
248
+ #
249
+ # 1. Normal doctypes, as introduced in the HTML5 specification.
250
+ # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
251
+ # 3. Legacy doctypes
252
+ #
253
+ doctype_start = '<!DOCTYPE'i (whitespace_or_newline+ $count_newlines);
254
+
255
+ action start_doctype {
256
+ callback_simple(id_on_doctype_start);
257
+
258
+ if ( lines > 0 )
259
+ {
260
+ advance_line(lines);
261
+
262
+ lines = 0;
263
+ }
264
+
265
+ fnext doctype;
266
+ }
267
+
268
+ # Machine for processing inline rules of a doctype.
269
+ doctype_inline := |*
270
+ ^']'* $count_newlines => {
271
+ callback(id_on_doctype_inline, data, encoding, ts, te);
272
+
273
+ if ( lines > 0 )
274
+ {
275
+ advance_line(lines);
276
+
277
+ lines = 0;
278
+ }
279
+ };
280
+
281
+ ']' => { fnext doctype; };
282
+ *|;
283
+
284
+ # Machine for processing doctypes. Doctype values such as the public
285
+ # and system IDs are treated as T_STRING tokens.
286
+ doctype := |*
287
+ 'PUBLIC' | 'SYSTEM' => {
288
+ callback(id_on_doctype_type, data, encoding, ts, te);
289
+ };
290
+
291
+ # Starts a set of inline doctype rules.
292
+ '[' => { fnext doctype_inline; };
293
+
294
+ # Lex the public/system IDs as regular strings.
295
+ squote => start_string_squote;
296
+ dquote => start_string_dquote;
297
+
298
+ identifier => {
299
+ callback(id_on_doctype_name, data, encoding, ts, te);
300
+ };
301
+
302
+ '>' => {
303
+ callback_simple(id_on_doctype_end);
304
+ fnext main;
305
+ };
306
+
307
+ newline => advance_newline;
308
+
309
+ whitespace;
310
+ *|;
311
+
312
+ # XML declaration tags
313
+ #
314
+ # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
315
+ #
316
+ xml_decl_start = '<?xml';
317
+ xml_decl_end = '?>';
318
+
319
+ action start_xml_decl {
320
+ callback_simple(id_on_xml_decl_start);
321
+ fnext xml_decl;
322
+ }
323
+
324
+ # Machine that processes the contents of an XML declaration tag.
325
+ xml_decl := |*
326
+ xml_decl_end => {
327
+ if ( lines > 0 )
328
+ {
329
+ advance_line(lines);
330
+
331
+ lines = 0;
332
+ }
333
+
334
+ callback_simple(id_on_xml_decl_end);
335
+
336
+ fnext main;
337
+ };
338
+
339
+ # Attributes and their values (e.g. version="1.0").
340
+ identifier => {
341
+ if ( lines > 0 )
342
+ {
343
+ advance_line(lines);
344
+
345
+ lines = 0;
346
+ }
347
+
348
+ callback(id_on_attribute, data, encoding, ts, te);
349
+ };
350
+
351
+ squote => start_string_squote;
352
+ dquote => start_string_dquote;
353
+
354
+ any $count_newlines;
355
+ *|;
356
+
357
+ # Elements
358
+ #
359
+ # http://www.w3.org/TR/html/syntax.html#syntax-elements
360
+ #
361
+ # Lexing of elements is broken up into different machines that handle the
362
+ # name/namespace, contents of the open tag and the body of an element. The
363
+ # body of an element is lexed using the `main` machine.
364
+ #
365
+
366
+ action start_element {
367
+ fhold;
368
+ fnext element_name;
369
+ }
370
+
371
+ action start_close_element {
372
+ fnext element_close;
373
+ }
374
+
375
+ action close_element {
376
+ callback(id_on_element_end, data, encoding, ts, te);
377
+ }
378
+
379
+ action close_element_fnext_main {
380
+ callback_simple(id_on_element_end);
381
+
382
+ fnext main;
383
+ }
384
+
385
+ element_start = '<' ident_char;
386
+ element_end = '</';
387
+
388
+ # Machine used for lexing the name/namespace of an element.
389
+ element_name := |*
390
+ identifier ':' => {
391
+ callback(id_on_element_ns, data, encoding, ts, te - 1);
392
+ };
393
+
394
+ identifier => {
395
+ callback(id_on_element_name, data, encoding, ts, te);
396
+ fnext element_head;
397
+ };
398
+ *|;
399
+
400
+ # Machine used for lexing the closing tag of an element
401
+ element_close := |*
402
+ # namespace prefixes, currently not used but allows the rule below it
403
+ # to be used for the actual element name.
404
+ identifier ':';
405
+
406
+ identifier => close_element;
407
+
408
+ '>' => {
409
+ if ( lines > 0 )
410
+ {
411
+ advance_line(lines);
412
+
413
+ lines = 0;
414
+ }
415
+
416
+ fnext main;
417
+ };
418
+
419
+ any $count_newlines;
420
+ *|;
421
+
422
+ # Characters that can be used for unquoted HTML attribute values.
423
+ # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
424
+ # for more info.
425
+ html_unquoted_value =
426
+ ^(squote | dquote | whitespace_or_newline)
427
+ ^('`' | '=' | '<' | '>' | whitespace_or_newline)+;
428
+
429
+ # Machine used after matching the "=" of an attribute and just before moving
430
+ # into the actual attribute value.
431
+ attribute_pre := |*
432
+ whitespace_or_newline $count_newlines;
433
+
434
+ any => {
435
+ fhold;
436
+
437
+ if ( lines > 0 )
438
+ {
439
+ advance_line(lines);
440
+
441
+ lines = 0;
442
+ }
443
+
444
+ if ( html_p )
445
+ {
446
+ fnext html_attribute_value;
447
+ }
448
+ else
449
+ {
450
+ fnext xml_attribute_value;
451
+ }
452
+ };
453
+ *|;
454
+
455
+ # Machine used for processing HTML attribute values.
456
+ html_attribute_value := |*
457
+ squote | dquote => {
458
+ fhold;
459
+ fnext xml_attribute_value;
460
+ };
461
+
462
+ # Unquoted attribute values are lexed as if they were single quoted
463
+ # strings.
464
+ html_unquoted_value => {
465
+ callback_simple(id_on_string_squote);
466
+
467
+ callback(id_on_string_body, data, encoding, ts, te);
468
+
469
+ callback_simple(id_on_string_squote);
470
+ };
471
+
472
+ any => hold_and_return;
473
+ *|;
474
+
475
+ # Machine used for processing XML attribute values.
476
+ xml_attribute_value := |*
477
+ # The following two actions use "fnext" instead of "fcall". Combined
478
+ # with "element_head" using "fcall" to enter "attribute_pre" this means
479
+ # we can return back to "element_head" after processing a single string.
480
+ squote => {
481
+ callback_simple(id_on_string_squote);
482
+
483
+ fnext string_squote;
484
+ };
485
+
486
+ dquote => {
487
+ callback_simple(id_on_string_dquote);
488
+
489
+ fnext string_dquote;
490
+ };
491
+
492
+ any => hold_and_return;
493
+ *|;
494
+
495
+ # Machine used for processing the contents of an element's starting tag.
496
+ # This includes the name, namespace and attributes.
497
+ element_head := |*
498
+ newline => advance_newline;
499
+
500
+ # Attribute names and namespaces.
501
+ identifier ':' => {
502
+ callback(id_on_attribute_ns, data, encoding, ts, te - 1);
503
+ };
504
+
505
+ identifier => {
506
+ callback(id_on_attribute, data, encoding, ts, te);
507
+ };
508
+
509
+ # Attribute values.
510
+ '=' => {
511
+ fcall attribute_pre;
512
+ };
513
+
514
+ # We're done with the open tag of the element.
515
+ '>' => {
516
+ callback_simple(id_on_element_open_end);
517
+
518
+ if ( html_script_p() )
519
+ {
520
+ fnext html_script;
521
+ }
522
+ else if ( html_style_p() )
523
+ {
524
+ fnext html_style;
525
+ }
526
+ else
527
+ {
528
+ fnext main;
529
+ }
530
+ };
531
+
532
+ # Self closing tags.
533
+ '/>' => {
534
+ callback_simple(id_on_element_end);
535
+ fnext main;
536
+ };
537
+
538
+ any;
539
+ *|;
540
+
541
+ # Text
542
+ #
543
+ # http://www.w3.org/TR/xml/#syntax
544
+ # http://www.w3.org/TR/html/syntax.html#text
545
+ #
546
+ # Text content is everything leading up to certain special tags such as "</"
547
+ # and "<?".
548
+
549
+ action start_text {
550
+ fhold;
551
+ fnext text;
552
+ }
553
+
554
+ # These characters terminate a T_TEXT sequence and instruct Ragel to jump
555
+ # back to the main machine.
556
+ #
557
+ # Note that this only works if each sequence is exactly 2 characters
558
+ # long. Because of this "<!" is used instead of "<!--".
559
+
560
+ terminate_text = '</' | '<!' | '<?' | element_start;
561
+ allowed_text = (any* -- terminate_text) $count_newlines;
562
+
563
+ action emit_text {
564
+ callback(id_on_text, data, encoding, ts, te);
565
+
566
+ if ( lines > 0 )
567
+ {
568
+ advance_line(lines);
569
+
570
+ lines = 0;
571
+ }
572
+ }
573
+
574
+ text := |*
575
+ terminate_text | allowed_text => {
576
+ callback(id_on_text, data, encoding, ts, te);
577
+
578
+ if ( lines > 0 )
579
+ {
580
+ advance_line(lines);
581
+
582
+ lines = 0;
583
+ }
584
+
585
+ fnext main;
586
+ };
587
+
588
+ # Text followed by a special tag, such as "foo<!--"
589
+ allowed_text %{ mark = p; } terminate_text => {
590
+ callback(id_on_text, data, encoding, ts, mark);
591
+
592
+ p = mark - 1;
593
+ mark = 0;
594
+
595
+ if ( lines > 0 )
596
+ {
597
+ advance_line(lines);
598
+
599
+ lines = 0;
600
+ }
601
+
602
+ fnext main;
603
+ };
604
+ *|;
605
+
606
+ # Certain tags in HTML can contain basically anything except for the literal
607
+ # closing tag. Two examples are script and style tags. As a result of this
608
+ # we can't use the regular text machine.
609
+
610
+ literal_html_allowed = (^'<'+ | '<'+) $count_newlines;
611
+
612
+ html_script := |*
613
+ literal_html_allowed => emit_text;
614
+ '</script>' => close_element_fnext_main;
615
+ *|;
616
+
617
+ html_style := |*
618
+ literal_html_allowed => emit_text;
619
+ '</style>' => close_element_fnext_main;
620
+ *|;
621
+
622
+ # The main machine aka the entry point of Ragel.
623
+ main := |*
624
+ doctype_start => start_doctype;
625
+ xml_decl_start => start_xml_decl;
626
+ comment_start => start_comment;
627
+ cdata_start => start_cdata;
628
+ proc_ins_start => start_proc_ins;
629
+ element_start => start_element;
630
+ element_end => start_close_element;
631
+ any => start_text;
632
+ *|;
633
+ }%%