rubyjedi-oga 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +13 -0
- data/LICENSE +362 -0
- data/README.md +317 -0
- data/doc/css/common.css +77 -0
- data/doc/css_selectors.md +935 -0
- data/doc/manually_creating_documents.md +67 -0
- data/doc/migrating_from_nokogiri.md +169 -0
- data/doc/xml_namespaces.md +63 -0
- data/ext/c/extconf.rb +11 -0
- data/ext/c/lexer.c +2595 -0
- data/ext/c/lexer.h +16 -0
- data/ext/c/lexer.rl +198 -0
- data/ext/c/liboga.c +6 -0
- data/ext/c/liboga.h +11 -0
- data/ext/java/Liboga.java +14 -0
- data/ext/java/org/liboga/xml/Lexer.java +1363 -0
- data/ext/java/org/liboga/xml/Lexer.rl +223 -0
- data/ext/ragel/base_lexer.rl +633 -0
- data/lib/oga.rb +57 -0
- data/lib/oga/blacklist.rb +40 -0
- data/lib/oga/css/lexer.rb +743 -0
- data/lib/oga/css/parser.rb +976 -0
- data/lib/oga/entity_decoder.rb +21 -0
- data/lib/oga/html/entities.rb +2150 -0
- data/lib/oga/html/parser.rb +25 -0
- data/lib/oga/html/sax_parser.rb +18 -0
- data/lib/oga/lru.rb +160 -0
- data/lib/oga/oga.rb +57 -0
- data/lib/oga/version.rb +3 -0
- data/lib/oga/whitelist.rb +20 -0
- data/lib/oga/xml/attribute.rb +136 -0
- data/lib/oga/xml/cdata.rb +17 -0
- data/lib/oga/xml/character_node.rb +37 -0
- data/lib/oga/xml/comment.rb +17 -0
- data/lib/oga/xml/default_namespace.rb +13 -0
- data/lib/oga/xml/doctype.rb +82 -0
- data/lib/oga/xml/document.rb +108 -0
- data/lib/oga/xml/element.rb +428 -0
- data/lib/oga/xml/entities.rb +122 -0
- data/lib/oga/xml/html_void_elements.rb +15 -0
- data/lib/oga/xml/lexer.rb +550 -0
- data/lib/oga/xml/namespace.rb +48 -0
- data/lib/oga/xml/node.rb +219 -0
- data/lib/oga/xml/node_set.rb +333 -0
- data/lib/oga/xml/parser.rb +631 -0
- data/lib/oga/xml/processing_instruction.rb +37 -0
- data/lib/oga/xml/pull_parser.rb +175 -0
- data/lib/oga/xml/querying.rb +56 -0
- data/lib/oga/xml/sax_parser.rb +192 -0
- data/lib/oga/xml/text.rb +66 -0
- data/lib/oga/xml/traversal.rb +50 -0
- data/lib/oga/xml/xml_declaration.rb +65 -0
- data/lib/oga/xpath/evaluator.rb +1798 -0
- data/lib/oga/xpath/lexer.rb +1958 -0
- data/lib/oga/xpath/parser.rb +622 -0
- data/oga.gemspec +45 -0
- metadata +227 -0
@@ -0,0 +1,223 @@
|
|
1
|
+
package org.liboga.xml;
|
2
|
+
|
3
|
+
%%machine java_lexer;
|
4
|
+
|
5
|
+
import java.io.IOException;
|
6
|
+
|
7
|
+
import org.jcodings.Encoding;
|
8
|
+
|
9
|
+
import org.jruby.Ruby;
|
10
|
+
import org.jruby.RubyModule;
|
11
|
+
import org.jruby.RubyClass;
|
12
|
+
import org.jruby.RubyObject;
|
13
|
+
import org.jruby.RubyString;
|
14
|
+
import org.jruby.RubyFixnum;
|
15
|
+
import org.jruby.util.ByteList;
|
16
|
+
import org.jruby.anno.JRubyClass;
|
17
|
+
import org.jruby.anno.JRubyMethod;
|
18
|
+
import org.jruby.runtime.ThreadContext;
|
19
|
+
import org.jruby.runtime.ObjectAllocator;
|
20
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
21
|
+
|
22
|
+
/**
 * Lexer support class for JRuby.
 *
 * The Lexer class contains the raw Ragel loop and calls back in to Ruby land
 * whenever a Ragel action needs to emit something, similar to the C extension
 * setup.
 *
 * This class requires Ruby land to first define the `Oga::XML` namespace.
 */
@JRubyClass(name="Oga::XML::Lexer", parent="Object")
public class Lexer extends RubyObject
{
    /**
     * The current Ruby runtime.
     */
    private final Ruby runtime;

    %% write data;

    /*
     * Used by Ragel to keep track of the current state. These are instance
     * fields (instead of locals) so they survive between successive calls to
     * advance_native.
     */
    int act;
    int cs;
    int top;
    int[] stack;

    /*
     * Newlines counted by the grammar that have not yet been reported to Ruby
     * via advance_line. Copied in to a local at the start of advance_native
     * and written back when it finishes.
     */
    int lines;

    /**
     * Sets up the current class in the Ruby runtime.
     *
     * Looks up `Oga::XML`, defines `Lexer` under it (with Object as the
     * parent class) and registers the methods annotated with JRubyMethod.
     *
     * @param runtime The Ruby runtime to define the class in.
     */
    public static void load(Ruby runtime)
    {
        RubyModule xml = (RubyModule) runtime.getModule("Oga")
            .getConstant("XML");

        RubyClass lexer = xml.defineClassUnder(
            "Lexer",
            runtime.getObject(),
            ALLOCATOR
        );

        lexer.defineAnnotatedMethods(Lexer.class);
    }

    /**
     * Allocator handed to JRuby so it can create instances of this class.
     */
    private static final ObjectAllocator ALLOCATOR = new ObjectAllocator()
    {
        public IRubyObject allocate(Ruby runtime, RubyClass klass)
        {
            return new org.liboga.xml.Lexer(runtime, klass);
        }
    };

    /**
     * @param runtime The current Ruby runtime.
     * @param klass The Ruby class of the object being created.
     */
    public Lexer(Ruby runtime, RubyClass klass)
    {
        super(runtime, klass);

        this.runtime = runtime;
    }

    /**
     * Runs the bulk of the Ragel loop and calls back in to Ruby.
     *
     * The input is taken from `rb_str`. Its encoding is passed along to the
     * callbacks to make sure that token values share the same encoding as the
     * input.
     *
     * The names of the locals declared here (`data`, `encoding`, `ts`, `te`,
     * `p`, `pe`, `eof`, `mark`, `lines`, `html_p` and the `id_*` strings) are
     * referenced by the Ragel generated code and by the actions in
     * base_lexer.rl, so they must not be renamed.
     *
     * @param context The current thread context.
     * @param rb_str The chunk of input to lex.
     * @return Always nil.
     */
    @JRubyMethod
    public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
    {
        // Evaluated once per call; used by the grammar to choose between the
        // HTML and XML attribute value machines.
        boolean html_p = this.callMethod(context, "html?").isTrue();

        Encoding encoding = rb_str.getEncoding();

        byte[] data = rb_str.getBytes();

        int ts    = 0;
        int te    = 0;
        int p     = 0;
        int mark  = 0;
        int lines = this.lines;
        int pe    = data.length;
        int eof   = data.length;

        String id_on_attribute        = "on_attribute";
        String id_on_attribute_ns     = "on_attribute_ns";
        String id_on_cdata_start      = "on_cdata_start";
        String id_on_cdata_body       = "on_cdata_body";
        String id_on_cdata_end        = "on_cdata_end";
        String id_on_comment_start    = "on_comment_start";
        String id_on_comment_body     = "on_comment_body";
        String id_on_comment_end      = "on_comment_end";
        String id_on_doctype_end      = "on_doctype_end";
        String id_on_doctype_inline   = "on_doctype_inline";
        String id_on_doctype_name     = "on_doctype_name";
        String id_on_doctype_start    = "on_doctype_start";
        String id_on_doctype_type     = "on_doctype_type";
        String id_on_element_end      = "on_element_end";
        String id_on_element_name     = "on_element_name";
        String id_on_element_ns       = "on_element_ns";
        String id_on_element_open_end = "on_element_open_end";
        String id_on_proc_ins_end     = "on_proc_ins_end";
        String id_on_proc_ins_name    = "on_proc_ins_name";
        String id_on_proc_ins_start   = "on_proc_ins_start";
        String id_on_proc_ins_body    = "on_proc_ins_body";
        String id_on_string_body      = "on_string_body";
        String id_on_string_dquote    = "on_string_dquote";
        String id_on_string_squote    = "on_string_squote";
        String id_on_text             = "on_text";
        String id_on_xml_decl_end     = "on_xml_decl_end";
        String id_on_xml_decl_start   = "on_xml_decl_start";

        %% write exec;

        // Keep any newlines that were counted but not yet reported so the
        // next call can pick them up.
        this.lines = lines;

        return context.nil;
    }

    /**
     * Resets the internal state of the lexer.
     *
     * This also clears the pending newline count. Without this a count left
     * over from previously lexed input would be reported (through
     * advance_line) while lexing the next input.
     *
     * @param context The current thread context.
     * @return Always nil.
     */
    @JRubyMethod
    public IRubyObject reset_native(ThreadContext context)
    {
        this.act   = 0;
        this.top   = 0;
        this.lines = 0;
        this.stack = new int[4];
        this.cs    = java_lexer_start;

        return context.nil;
    }

    /**
     * Calls back in to Ruby land passing the current token value along.
     *
     * This method calls back in to Ruby land based on the method name
     * specified in `name`. The Ruby callback should take one argument. This
     * argument will be a String containing the value of the current token.
     *
     * @param name The name of the Ruby method to call.
     * @param data The input data.
     * @param enc The encoding to use for the token value.
     * @param ts The start offset (inclusive) of the token in `data`.
     * @param te The end offset (exclusive) of the token in `data`.
     */
    public void callback(String name, byte[] data, Encoding enc, int ts, int te)
    {
        // NOTE(review): the trailing `true` presumably asks ByteList to copy
        // the bytes instead of wrapping `data` - confirm against the JRuby
        // ByteList documentation.
        ByteList bytelist = new ByteList(data, ts, te - ts, enc, true);

        RubyString value = this.runtime.newString(bytelist);

        ThreadContext context = this.runtime.getCurrentContext();

        this.callMethod(context, name, value);
    }

    /**
     * Calls back in to Ruby land without passing any arguments.
     *
     * @param name The name of the Ruby method to call.
     */
    public void callback_simple(String name)
    {
        ThreadContext context = this.runtime.getCurrentContext();

        this.callMethod(context, name);
    }

    /**
     * Advances the line number by `amount` lines by calling the Ruby method
     * `advance_line`.
     *
     * @param amount The number of lines to advance by.
     */
    public void advance_line(int amount)
    {
        ThreadContext context = this.runtime.getCurrentContext();
        RubyFixnum lines      = this.runtime.newFixnum(amount);

        this.callMethod(context, "advance_line", lines);
    }

    /**
     * @see Oga::XML::Lexer#html_script?
     *
     * @return The truthiness of the Ruby method `html_script?`.
     */
    public Boolean html_script_p()
    {
        ThreadContext context = this.runtime.getCurrentContext();

        return this.callMethod(context, "html_script?").isTrue();
    }

    /**
     * @see Oga::XML::Lexer#html_style?
     *
     * @return The truthiness of the Ruby method `html_style?`.
     */
    public Boolean html_style_p()
    {
        ThreadContext context = this.runtime.getCurrentContext();

        return this.callMethod(context, "html_style?").isTrue();
    }
}
|
215
|
+
|
216
|
+
%%{
|
217
|
+
variable act this.act;
|
218
|
+
variable cs this.cs;
|
219
|
+
variable stack this.stack;
|
220
|
+
variable top this.top;
|
221
|
+
|
222
|
+
include base_lexer "base_lexer.rl";
|
223
|
+
}%%
|
@@ -0,0 +1,633 @@
|
|
1
|
+
%%machine base_lexer;
|
2
|
+
|
3
|
+
%%{
|
4
|
+
##
|
5
|
+
# Base grammar for the XML lexer.
|
6
|
+
#
|
7
|
+
# This grammar is shared between the C and Java extensions. As a result of
|
8
|
+
# this you should **not** include language specific code in Ragel
|
9
|
+
# actions/callbacks.
|
10
|
+
#
|
11
|
+
# To call back in to Ruby you can use one of the following two functions:
|
12
|
+
#
|
13
|
+
# * callback
|
14
|
+
# * callback_simple
|
15
|
+
#
|
16
|
+
# The first function takes 5 arguments:
|
17
|
+
#
|
18
|
+
# * The name of the Ruby method to call.
|
19
|
+
# * The input data.
|
20
|
+
# * The encoding of the input data.
|
21
|
+
# * The start of the current buffer.
|
22
|
+
# * The end of the current buffer.
|
23
|
+
#
|
24
|
+
# The function callback_simple only takes one argument: the name of the
|
25
|
+
# method to call. This function should be used for callbacks that don't
|
26
|
+
# require any values.
|
27
|
+
#
|
28
|
+
# When you call a method in Ruby make sure that said method is defined as
|
29
|
+
# an instance method in the `Oga::XML::Lexer` class.
|
30
|
+
#
|
31
|
+
# The name of the callback to invoke should be an identifier starting with
|
32
|
+
# "id_". The identifier should be defined in the associated C and Java code.
|
33
|
+
# In case of C code its value should be a Symbol as an ID object; for Java
|
34
|
+
# it should be a String. For example:
|
35
|
+
#
|
36
|
+
# ID id_foo = rb_intern("foo");
|
37
|
+
#
|
38
|
+
# And for Java:
|
39
|
+
#
|
40
|
+
# String id_foo = "foo";
|
41
|
+
#
|
42
|
+
# ## Machine Transitions
|
43
|
+
#
|
44
|
+
# To transition from one machine to another always use `fnext` instead of
|
45
|
+
# `fcall` and `fret`. This removes the need for the code to keep track of a
|
46
|
+
# stack. The exceptions are strings and attribute values, which use `fcall`/`fret`.
|
47
|
+
#
|
48
|
+
|
49
|
+
newline = '\r\n' | '\n' | '\r';
|
50
|
+
whitespace = [ \t];
|
51
|
+
ident_char = [a-zA-Z0-9\-_\.];
|
52
|
+
identifier = ident_char+;
|
53
|
+
|
54
|
+
whitespace_or_newline = whitespace | newline;
|
55
|
+
|
56
|
+
action count_newlines {
|
57
|
+
if ( fc == '\n' ) lines++;
|
58
|
+
}
|
59
|
+
|
60
|
+
action advance_newline {
|
61
|
+
advance_line(1);
|
62
|
+
}
|
63
|
+
|
64
|
+
action hold_and_return {
|
65
|
+
fhold;
|
66
|
+
fret;
|
67
|
+
}
|
68
|
+
|
69
|
+
# Comments
|
70
|
+
#
|
71
|
+
# http://www.w3.org/TR/html/syntax.html#comments
|
72
|
+
#
|
73
|
+
# Unlike the W3C specification these rules *do* allow character sequences
|
74
|
+
# such as `--` and `->`. Putting extra checks in for these sequences would
|
75
|
+
# actually make the rules/actions more complex.
|
76
|
+
#
|
77
|
+
|
78
|
+
comment_start = '<!--';
|
79
|
+
comment_end = '-->';
|
80
|
+
|
81
|
+
# Everything except "-" OR a single "-"
|
82
|
+
comment_allowed = (^'-'+ | '-') $count_newlines;
|
83
|
+
|
84
|
+
action start_comment {
|
85
|
+
callback_simple(id_on_comment_start);
|
86
|
+
|
87
|
+
fnext comment_body;
|
88
|
+
}
|
89
|
+
|
90
|
+
comment_body := |*
|
91
|
+
comment_allowed => {
|
92
|
+
callback(id_on_comment_body, data, encoding, ts, te);
|
93
|
+
|
94
|
+
if ( lines > 0 )
|
95
|
+
{
|
96
|
+
advance_line(lines);
|
97
|
+
|
98
|
+
lines = 0;
|
99
|
+
}
|
100
|
+
};
|
101
|
+
|
102
|
+
comment_end => {
|
103
|
+
callback_simple(id_on_comment_end);
|
104
|
+
|
105
|
+
fnext main;
|
106
|
+
};
|
107
|
+
*|;
|
108
|
+
|
109
|
+
# CDATA
|
110
|
+
#
|
111
|
+
# http://www.w3.org/TR/html/syntax.html#cdata-sections
|
112
|
+
#
|
113
|
+
# In HTML CDATA tags have no meaning/are not supported. Oga does
|
114
|
+
# support them but treats their contents as plain text.
|
115
|
+
#
|
116
|
+
|
117
|
+
cdata_start = '<![CDATA[';
|
118
|
+
cdata_end = ']]>';
|
119
|
+
|
120
|
+
# Everything except "]" OR a single "]"
|
121
|
+
cdata_allowed = (^']'+ | ']') $count_newlines;
|
122
|
+
|
123
|
+
action start_cdata {
|
124
|
+
callback_simple(id_on_cdata_start);
|
125
|
+
|
126
|
+
fnext cdata_body;
|
127
|
+
}
|
128
|
+
|
129
|
+
cdata_body := |*
|
130
|
+
cdata_allowed => {
|
131
|
+
callback(id_on_cdata_body, data, encoding, ts, te);
|
132
|
+
|
133
|
+
if ( lines > 0 )
|
134
|
+
{
|
135
|
+
advance_line(lines);
|
136
|
+
|
137
|
+
lines = 0;
|
138
|
+
}
|
139
|
+
};
|
140
|
+
|
141
|
+
cdata_end => {
|
142
|
+
callback_simple(id_on_cdata_end);
|
143
|
+
|
144
|
+
fnext main;
|
145
|
+
};
|
146
|
+
*|;
|
147
|
+
|
148
|
+
# Processing Instructions
|
149
|
+
#
|
150
|
+
# http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
|
151
|
+
# http://en.wikipedia.org/wiki/Processing_Instruction
|
152
|
+
#
|
153
|
+
# These are tags meant to be used by parsers/libraries for custom behaviour.
|
154
|
+
# One example is the pair of tags used by PHP: <?php and ?>. Note that the XML
|
155
|
+
# declaration tags (<?xml ?>) are not considered to be a processing
|
156
|
+
# instruction.
|
157
|
+
#
|
158
|
+
|
159
|
+
proc_ins_start = '<?' identifier;
|
160
|
+
proc_ins_end = '?>';
|
161
|
+
|
162
|
+
# Everything except "?" OR a single "?"
|
163
|
+
proc_ins_allowed = (^'?'+ | '?') $count_newlines;
|
164
|
+
|
165
|
+
action start_proc_ins {
|
166
|
+
callback_simple(id_on_proc_ins_start);
|
167
|
+
callback(id_on_proc_ins_name, data, encoding, ts + 2, te);
|
168
|
+
|
169
|
+
fnext proc_ins_body;
|
170
|
+
}
|
171
|
+
|
172
|
+
proc_ins_body := |*
|
173
|
+
proc_ins_allowed => {
|
174
|
+
callback(id_on_proc_ins_body, data, encoding, ts, te);
|
175
|
+
|
176
|
+
if ( lines > 0 )
|
177
|
+
{
|
178
|
+
advance_line(lines);
|
179
|
+
|
180
|
+
lines = 0;
|
181
|
+
}
|
182
|
+
};
|
183
|
+
|
184
|
+
proc_ins_end => {
|
185
|
+
callback_simple(id_on_proc_ins_end);
|
186
|
+
|
187
|
+
fnext main;
|
188
|
+
};
|
189
|
+
*|;
|
190
|
+
|
191
|
+
# Strings
|
192
|
+
#
|
193
|
+
# Strings in HTML can either be single or double quoted. If a string
|
194
|
+
# starts with one of these quotes it must be closed with the same type
|
195
|
+
# of quote.
|
196
|
+
#
|
197
|
+
dquote = '"';
|
198
|
+
squote = "'";
|
199
|
+
|
200
|
+
action emit_string {
|
201
|
+
callback(id_on_string_body, data, encoding, ts, te);
|
202
|
+
|
203
|
+
if ( lines > 0 )
|
204
|
+
{
|
205
|
+
advance_line(lines);
|
206
|
+
|
207
|
+
lines = 0;
|
208
|
+
}
|
209
|
+
}
|
210
|
+
|
211
|
+
action start_string_squote {
|
212
|
+
callback_simple(id_on_string_squote);
|
213
|
+
|
214
|
+
fcall string_squote;
|
215
|
+
}
|
216
|
+
|
217
|
+
action start_string_dquote {
|
218
|
+
callback_simple(id_on_string_dquote);
|
219
|
+
|
220
|
+
fcall string_dquote;
|
221
|
+
}
|
222
|
+
|
223
|
+
string_squote := |*
|
224
|
+
^squote* $count_newlines => emit_string;
|
225
|
+
|
226
|
+
squote => {
|
227
|
+
callback_simple(id_on_string_squote);
|
228
|
+
|
229
|
+
fret;
|
230
|
+
};
|
231
|
+
*|;
|
232
|
+
|
233
|
+
string_dquote := |*
|
234
|
+
^dquote* $count_newlines => emit_string;
|
235
|
+
|
236
|
+
dquote => {
|
237
|
+
callback_simple(id_on_string_dquote);
|
238
|
+
|
239
|
+
fret;
|
240
|
+
};
|
241
|
+
*|;
|
242
|
+
|
243
|
+
# DOCTYPES
|
244
|
+
#
|
245
|
+
# http://www.w3.org/TR/html/syntax.html#the-doctype
|
246
|
+
#
|
247
|
+
# These rules support the 3 flavours of doctypes:
|
248
|
+
#
|
249
|
+
# 1. Normal doctypes, as introduced in the HTML5 specification.
|
250
|
+
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
|
251
|
+
# 3. Legacy doctypes
|
252
|
+
#
|
253
|
+
doctype_start = '<!DOCTYPE'i (whitespace_or_newline+ $count_newlines);
|
254
|
+
|
255
|
+
action start_doctype {
|
256
|
+
callback_simple(id_on_doctype_start);
|
257
|
+
|
258
|
+
if ( lines > 0 )
|
259
|
+
{
|
260
|
+
advance_line(lines);
|
261
|
+
|
262
|
+
lines = 0;
|
263
|
+
}
|
264
|
+
|
265
|
+
fnext doctype;
|
266
|
+
}
|
267
|
+
|
268
|
+
# Machine for processing inline rules of a doctype.
|
269
|
+
doctype_inline := |*
|
270
|
+
^']'* $count_newlines => {
|
271
|
+
callback(id_on_doctype_inline, data, encoding, ts, te);
|
272
|
+
|
273
|
+
if ( lines > 0 )
|
274
|
+
{
|
275
|
+
advance_line(lines);
|
276
|
+
|
277
|
+
lines = 0;
|
278
|
+
}
|
279
|
+
};
|
280
|
+
|
281
|
+
']' => { fnext doctype; };
|
282
|
+
*|;
|
283
|
+
|
284
|
+
# Machine for processing doctypes. Doctype values such as the public
|
285
|
+
# and system IDs are treated as T_STRING tokens.
|
286
|
+
doctype := |*
|
287
|
+
'PUBLIC' | 'SYSTEM' => {
|
288
|
+
callback(id_on_doctype_type, data, encoding, ts, te);
|
289
|
+
};
|
290
|
+
|
291
|
+
# Starts a set of inline doctype rules.
|
292
|
+
'[' => { fnext doctype_inline; };
|
293
|
+
|
294
|
+
# Lex the public/system IDs as regular strings.
|
295
|
+
squote => start_string_squote;
|
296
|
+
dquote => start_string_dquote;
|
297
|
+
|
298
|
+
identifier => {
|
299
|
+
callback(id_on_doctype_name, data, encoding, ts, te);
|
300
|
+
};
|
301
|
+
|
302
|
+
'>' => {
|
303
|
+
callback_simple(id_on_doctype_end);
|
304
|
+
fnext main;
|
305
|
+
};
|
306
|
+
|
307
|
+
newline => advance_newline;
|
308
|
+
|
309
|
+
whitespace;
|
310
|
+
*|;
|
311
|
+
|
312
|
+
# XML declaration tags
|
313
|
+
#
|
314
|
+
# http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
|
315
|
+
#
|
316
|
+
xml_decl_start = '<?xml';
|
317
|
+
xml_decl_end = '?>';
|
318
|
+
|
319
|
+
action start_xml_decl {
|
320
|
+
callback_simple(id_on_xml_decl_start);
|
321
|
+
fnext xml_decl;
|
322
|
+
}
|
323
|
+
|
324
|
+
# Machine that processes the contents of an XML declaration tag.
|
325
|
+
xml_decl := |*
|
326
|
+
xml_decl_end => {
|
327
|
+
if ( lines > 0 )
|
328
|
+
{
|
329
|
+
advance_line(lines);
|
330
|
+
|
331
|
+
lines = 0;
|
332
|
+
}
|
333
|
+
|
334
|
+
callback_simple(id_on_xml_decl_end);
|
335
|
+
|
336
|
+
fnext main;
|
337
|
+
};
|
338
|
+
|
339
|
+
# Attributes and their values (e.g. version="1.0").
|
340
|
+
identifier => {
|
341
|
+
if ( lines > 0 )
|
342
|
+
{
|
343
|
+
advance_line(lines);
|
344
|
+
|
345
|
+
lines = 0;
|
346
|
+
}
|
347
|
+
|
348
|
+
callback(id_on_attribute, data, encoding, ts, te);
|
349
|
+
};
|
350
|
+
|
351
|
+
squote => start_string_squote;
|
352
|
+
dquote => start_string_dquote;
|
353
|
+
|
354
|
+
any $count_newlines;
|
355
|
+
*|;
|
356
|
+
|
357
|
+
# Elements
|
358
|
+
#
|
359
|
+
# http://www.w3.org/TR/html/syntax.html#syntax-elements
|
360
|
+
#
|
361
|
+
# Lexing of elements is broken up into different machines that handle the
|
362
|
+
# name/namespace, contents of the open tag and the body of an element. The
|
363
|
+
# body of an element is lexed using the `main` machine.
|
364
|
+
#
|
365
|
+
|
366
|
+
action start_element {
|
367
|
+
fhold;
|
368
|
+
fnext element_name;
|
369
|
+
}
|
370
|
+
|
371
|
+
action start_close_element {
|
372
|
+
fnext element_close;
|
373
|
+
}
|
374
|
+
|
375
|
+
action close_element {
|
376
|
+
callback(id_on_element_end, data, encoding, ts, te);
|
377
|
+
}
|
378
|
+
|
379
|
+
action close_element_fnext_main {
|
380
|
+
callback_simple(id_on_element_end);
|
381
|
+
|
382
|
+
fnext main;
|
383
|
+
}
|
384
|
+
|
385
|
+
element_start = '<' ident_char;
|
386
|
+
element_end = '</';
|
387
|
+
|
388
|
+
# Machine used for lexing the name/namespace of an element.
|
389
|
+
element_name := |*
|
390
|
+
identifier ':' => {
|
391
|
+
callback(id_on_element_ns, data, encoding, ts, te - 1);
|
392
|
+
};
|
393
|
+
|
394
|
+
identifier => {
|
395
|
+
callback(id_on_element_name, data, encoding, ts, te);
|
396
|
+
fnext element_head;
|
397
|
+
};
|
398
|
+
*|;
|
399
|
+
|
400
|
+
# Machine used for lexing the closing tag of an element
|
401
|
+
element_close := |*
|
402
|
+
# namespace prefixes, currently not used but allows the rule below it
|
403
|
+
# to be used for the actual element name.
|
404
|
+
identifier ':';
|
405
|
+
|
406
|
+
identifier => close_element;
|
407
|
+
|
408
|
+
'>' => {
|
409
|
+
if ( lines > 0 )
|
410
|
+
{
|
411
|
+
advance_line(lines);
|
412
|
+
|
413
|
+
lines = 0;
|
414
|
+
}
|
415
|
+
|
416
|
+
fnext main;
|
417
|
+
};
|
418
|
+
|
419
|
+
any $count_newlines;
|
420
|
+
*|;
|
421
|
+
|
422
|
+
# Characters that can be used for unquoted HTML attribute values.
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
# for more info.
#
# A value is either a first character followed by one or more "rest"
# characters, or a single character that is valid in both positions. The
# second form is needed for one character values such as `colspan=2`: without
# it the value would not match here and would instead be lexed by
# `element_head` as a separate attribute name.
html_unquoted_first = ^(squote | dquote | whitespace_or_newline);
html_unquoted_rest  = ^('`' | '=' | '<' | '>' | whitespace_or_newline);

html_unquoted_value =
    (html_unquoted_first html_unquoted_rest+) |
    (html_unquoted_first & html_unquoted_rest);
|
428
|
+
|
429
|
+
# Machine used after matching the "=" of an attribute and just before moving
|
430
|
+
# into the actual attribute value.
|
431
|
+
attribute_pre := |*
|
432
|
+
whitespace_or_newline $count_newlines;
|
433
|
+
|
434
|
+
any => {
|
435
|
+
fhold;
|
436
|
+
|
437
|
+
if ( lines > 0 )
|
438
|
+
{
|
439
|
+
advance_line(lines);
|
440
|
+
|
441
|
+
lines = 0;
|
442
|
+
}
|
443
|
+
|
444
|
+
if ( html_p )
|
445
|
+
{
|
446
|
+
fnext html_attribute_value;
|
447
|
+
}
|
448
|
+
else
|
449
|
+
{
|
450
|
+
fnext xml_attribute_value;
|
451
|
+
}
|
452
|
+
};
|
453
|
+
*|;
|
454
|
+
|
455
|
+
# Machine used for processing HTML attribute values.
|
456
|
+
html_attribute_value := |*
|
457
|
+
squote | dquote => {
|
458
|
+
fhold;
|
459
|
+
fnext xml_attribute_value;
|
460
|
+
};
|
461
|
+
|
462
|
+
# Unquoted attribute values are lexed as if they were single quoted
|
463
|
+
# strings.
|
464
|
+
html_unquoted_value => {
|
465
|
+
callback_simple(id_on_string_squote);
|
466
|
+
|
467
|
+
callback(id_on_string_body, data, encoding, ts, te);
|
468
|
+
|
469
|
+
callback_simple(id_on_string_squote);
|
470
|
+
};
|
471
|
+
|
472
|
+
any => hold_and_return;
|
473
|
+
*|;
|
474
|
+
|
475
|
+
# Machine used for processing XML attribute values.
|
476
|
+
xml_attribute_value := |*
|
477
|
+
# The following two actions use "fnext" instead of "fcall". Combined
|
478
|
+
# with "element_head" using "fcall" to jump to this machine this means
|
479
|
+
# we can return back to "element_head" after processing a single string.
|
480
|
+
squote => {
|
481
|
+
callback_simple(id_on_string_squote);
|
482
|
+
|
483
|
+
fnext string_squote;
|
484
|
+
};
|
485
|
+
|
486
|
+
dquote => {
|
487
|
+
callback_simple(id_on_string_dquote);
|
488
|
+
|
489
|
+
fnext string_dquote;
|
490
|
+
};
|
491
|
+
|
492
|
+
any => hold_and_return;
|
493
|
+
*|;
|
494
|
+
|
495
|
+
# Machine used for processing the contents of an element's starting tag.
|
496
|
+
# This includes the name, namespace and attributes.
|
497
|
+
element_head := |*
|
498
|
+
newline => advance_newline;
|
499
|
+
|
500
|
+
# Attribute names and namespaces.
|
501
|
+
identifier ':' => {
|
502
|
+
callback(id_on_attribute_ns, data, encoding, ts, te - 1);
|
503
|
+
};
|
504
|
+
|
505
|
+
identifier => {
|
506
|
+
callback(id_on_attribute, data, encoding, ts, te);
|
507
|
+
};
|
508
|
+
|
509
|
+
# Attribute values.
|
510
|
+
'=' => {
|
511
|
+
fcall attribute_pre;
|
512
|
+
};
|
513
|
+
|
514
|
+
# We're done with the open tag of the element.
|
515
|
+
'>' => {
|
516
|
+
callback_simple(id_on_element_open_end);
|
517
|
+
|
518
|
+
if ( html_script_p() )
|
519
|
+
{
|
520
|
+
fnext html_script;
|
521
|
+
}
|
522
|
+
else if ( html_style_p() )
|
523
|
+
{
|
524
|
+
fnext html_style;
|
525
|
+
}
|
526
|
+
else
|
527
|
+
{
|
528
|
+
fnext main;
|
529
|
+
}
|
530
|
+
};
|
531
|
+
|
532
|
+
# Self closing tags.
|
533
|
+
'/>' => {
|
534
|
+
callback_simple(id_on_element_end);
|
535
|
+
fnext main;
|
536
|
+
};
|
537
|
+
|
538
|
+
any;
|
539
|
+
*|;
|
540
|
+
|
541
|
+
# Text
|
542
|
+
#
|
543
|
+
# http://www.w3.org/TR/xml/#syntax
|
544
|
+
# http://www.w3.org/TR/html/syntax.html#text
|
545
|
+
#
|
546
|
+
# Text content is everything leading up to certain special tags such as "</"
|
547
|
+
# and "<?".
|
548
|
+
|
549
|
+
action start_text {
|
550
|
+
fhold;
|
551
|
+
fnext text;
|
552
|
+
}
|
553
|
+
|
554
|
+
# These characters terminate a T_TEXT sequence and instruct Ragel to jump
|
555
|
+
# back to the main machine.
|
556
|
+
#
|
557
|
+
# Note that this only works if each sequence is exactly 2 characters
|
558
|
+
# long. Because of this "<!" is used instead of "<!--".
|
559
|
+
|
560
|
+
terminate_text = '</' | '<!' | '<?' | element_start;
|
561
|
+
allowed_text = (any* -- terminate_text) $count_newlines;
|
562
|
+
|
563
|
+
action emit_text {
|
564
|
+
callback(id_on_text, data, encoding, ts, te);
|
565
|
+
|
566
|
+
if ( lines > 0 )
|
567
|
+
{
|
568
|
+
advance_line(lines);
|
569
|
+
|
570
|
+
lines = 0;
|
571
|
+
}
|
572
|
+
}
|
573
|
+
|
574
|
+
text := |*
|
575
|
+
terminate_text | allowed_text => {
|
576
|
+
callback(id_on_text, data, encoding, ts, te);
|
577
|
+
|
578
|
+
if ( lines > 0 )
|
579
|
+
{
|
580
|
+
advance_line(lines);
|
581
|
+
|
582
|
+
lines = 0;
|
583
|
+
}
|
584
|
+
|
585
|
+
fnext main;
|
586
|
+
};
|
587
|
+
|
588
|
+
# Text followed by a special tag, such as "foo<!--"
|
589
|
+
allowed_text %{ mark = p; } terminate_text => {
|
590
|
+
callback(id_on_text, data, encoding, ts, mark);
|
591
|
+
|
592
|
+
p = mark - 1;
|
593
|
+
mark = 0;
|
594
|
+
|
595
|
+
if ( lines > 0 )
|
596
|
+
{
|
597
|
+
advance_line(lines);
|
598
|
+
|
599
|
+
lines = 0;
|
600
|
+
}
|
601
|
+
|
602
|
+
fnext main;
|
603
|
+
};
|
604
|
+
*|;
|
605
|
+
|
606
|
+
# Certain tags in HTML can contain basically anything except for the literal
|
607
|
+
# closing tag. Two examples are script and style tags. As a result of this
|
608
|
+
# we can't use the regular text machine.
|
609
|
+
|
610
|
+
literal_html_allowed = (^'<'+ | '<'+) $count_newlines;
|
611
|
+
|
612
|
+
html_script := |*
|
613
|
+
literal_html_allowed => emit_text;
|
614
|
+
'</script>' => close_element_fnext_main;
|
615
|
+
*|;
|
616
|
+
|
617
|
+
html_style := |*
|
618
|
+
literal_html_allowed => emit_text;
|
619
|
+
'</style>' => close_element_fnext_main;
|
620
|
+
*|;
|
621
|
+
|
622
|
+
# The main machine aka the entry point of Ragel.
|
623
|
+
main := |*
|
624
|
+
doctype_start => start_doctype;
|
625
|
+
xml_decl_start => start_xml_decl;
|
626
|
+
comment_start => start_comment;
|
627
|
+
cdata_start => start_cdata;
|
628
|
+
proc_ins_start => start_proc_ins;
|
629
|
+
element_start => start_element;
|
630
|
+
element_end => start_close_element;
|
631
|
+
any => start_text;
|
632
|
+
*|;
|
633
|
+
}%%
|