rubyjedi-oga 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +13 -0
- data/LICENSE +362 -0
- data/README.md +317 -0
- data/doc/css/common.css +77 -0
- data/doc/css_selectors.md +935 -0
- data/doc/manually_creating_documents.md +67 -0
- data/doc/migrating_from_nokogiri.md +169 -0
- data/doc/xml_namespaces.md +63 -0
- data/ext/c/extconf.rb +11 -0
- data/ext/c/lexer.c +2595 -0
- data/ext/c/lexer.h +16 -0
- data/ext/c/lexer.rl +198 -0
- data/ext/c/liboga.c +6 -0
- data/ext/c/liboga.h +11 -0
- data/ext/java/Liboga.java +14 -0
- data/ext/java/org/liboga/xml/Lexer.java +1363 -0
- data/ext/java/org/liboga/xml/Lexer.rl +223 -0
- data/ext/ragel/base_lexer.rl +633 -0
- data/lib/oga.rb +57 -0
- data/lib/oga/blacklist.rb +40 -0
- data/lib/oga/css/lexer.rb +743 -0
- data/lib/oga/css/parser.rb +976 -0
- data/lib/oga/entity_decoder.rb +21 -0
- data/lib/oga/html/entities.rb +2150 -0
- data/lib/oga/html/parser.rb +25 -0
- data/lib/oga/html/sax_parser.rb +18 -0
- data/lib/oga/lru.rb +160 -0
- data/lib/oga/oga.rb +57 -0
- data/lib/oga/version.rb +3 -0
- data/lib/oga/whitelist.rb +20 -0
- data/lib/oga/xml/attribute.rb +136 -0
- data/lib/oga/xml/cdata.rb +17 -0
- data/lib/oga/xml/character_node.rb +37 -0
- data/lib/oga/xml/comment.rb +17 -0
- data/lib/oga/xml/default_namespace.rb +13 -0
- data/lib/oga/xml/doctype.rb +82 -0
- data/lib/oga/xml/document.rb +108 -0
- data/lib/oga/xml/element.rb +428 -0
- data/lib/oga/xml/entities.rb +122 -0
- data/lib/oga/xml/html_void_elements.rb +15 -0
- data/lib/oga/xml/lexer.rb +550 -0
- data/lib/oga/xml/namespace.rb +48 -0
- data/lib/oga/xml/node.rb +219 -0
- data/lib/oga/xml/node_set.rb +333 -0
- data/lib/oga/xml/parser.rb +631 -0
- data/lib/oga/xml/processing_instruction.rb +37 -0
- data/lib/oga/xml/pull_parser.rb +175 -0
- data/lib/oga/xml/querying.rb +56 -0
- data/lib/oga/xml/sax_parser.rb +192 -0
- data/lib/oga/xml/text.rb +66 -0
- data/lib/oga/xml/traversal.rb +50 -0
- data/lib/oga/xml/xml_declaration.rb +65 -0
- data/lib/oga/xpath/evaluator.rb +1798 -0
- data/lib/oga/xpath/lexer.rb +1958 -0
- data/lib/oga/xpath/parser.rb +622 -0
- data/oga.gemspec +45 -0
- metadata +227 -0
@@ -0,0 +1,223 @@
|
|
1
|
+
package org.liboga.xml;
|
2
|
+
|
3
|
+
%%machine java_lexer;
|
4
|
+
|
5
|
+
import java.io.IOException;
|
6
|
+
|
7
|
+
import org.jcodings.Encoding;
|
8
|
+
|
9
|
+
import org.jruby.Ruby;
|
10
|
+
import org.jruby.RubyModule;
|
11
|
+
import org.jruby.RubyClass;
|
12
|
+
import org.jruby.RubyObject;
|
13
|
+
import org.jruby.RubyString;
|
14
|
+
import org.jruby.RubyFixnum;
|
15
|
+
import org.jruby.util.ByteList;
|
16
|
+
import org.jruby.anno.JRubyClass;
|
17
|
+
import org.jruby.anno.JRubyMethod;
|
18
|
+
import org.jruby.runtime.ThreadContext;
|
19
|
+
import org.jruby.runtime.ObjectAllocator;
|
20
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
21
|
+
|
22
|
+
/**
 * Lexer support class for JRuby.
 *
 * The Lexer class contains the raw Ragel loop and calls back into Ruby land
 * whenever a Ragel action is needed, similar to the C extension setup.
 *
 * This class requires Ruby land to first define the `Oga::XML` namespace.
 */
@JRubyClass(name="Oga::XML::Lexer", parent="Object")
public class Lexer extends RubyObject
{
    /**
     * The current Ruby runtime.
     */
    private final Ruby runtime;

    %% write data;

    /* Used by Ragel to keep track of the current state. */
    int act;
    int cs;
    int top;
    int[] stack;

    /* Newlines counted by the grammar that were not yet reported to Ruby. */
    int lines;

    /**
     * Sets up the current class in the Ruby runtime.
     *
     * Defines `Lexer` under the `XML` constant of the `Oga` module and
     * registers the annotated methods of this class on it.
     *
     * @param runtime The Ruby runtime to define the class in.
     */
    public static void load(Ruby runtime)
    {
        RubyModule xml = (RubyModule) runtime.getModule("Oga")
            .getConstant("XML");

        RubyClass lexer = xml.defineClassUnder(
            "Lexer",
            runtime.getObject(),
            ALLOCATOR
        );

        lexer.defineAnnotatedMethods(Lexer.class);
    }

    /**
     * Allocator handed to the runtime so that new Ruby-side lexer objects are
     * backed by an instance of this class.
     */
    private static final ObjectAllocator ALLOCATOR = new ObjectAllocator()
    {
        @Override
        public IRubyObject allocate(Ruby runtime, RubyClass klass)
        {
            return new org.liboga.xml.Lexer(runtime, klass);
        }
    };

    /**
     * @param runtime The current Ruby runtime.
     * @param klass The Ruby class of the object being created.
     */
    public Lexer(Ruby runtime, RubyClass klass)
    {
        super(runtime, klass);

        this.runtime = runtime;
    }

    /**
     * Runs the bulk of the Ragel loop and calls back into Ruby.
     *
     * The input is taken from `rb_str`. Its encoding is passed along to the
     * callbacks to make sure that token values share the same encoding as the
     * input.
     *
     * The Ragel state (`act`, `cs`, `top`, `stack`) and the pending newline
     * count (`lines`) live on the instance; `lines` is copied into a local
     * for the duration of the loop and written back afterwards.
     *
     * @param context The current thread context.
     * @param rb_str The chunk of input to lex.
     * @return Always nil.
     */
    @JRubyMethod
    public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
    {
        /* Read by the attribute_pre machine to pick the HTML or XML rules. */
        boolean html_p = this.callMethod(context, "html?").isTrue();

        Encoding encoding = rb_str.getEncoding();

        byte[] data = rb_str.getBytes();

        int ts    = 0;
        int te    = 0;
        int p     = 0;
        int mark  = 0;
        int lines = this.lines;
        int pe    = data.length;
        int eof   = data.length;

        /*
         * Names of the Ruby callbacks. The shared grammar refers to these by
         * their "id_" identifiers.
         */
        String id_advance_line        = "advance_line";
        String id_on_attribute        = "on_attribute";
        String id_on_attribute_ns     = "on_attribute_ns";
        String id_on_cdata_start      = "on_cdata_start";
        String id_on_cdata_body       = "on_cdata_body";
        String id_on_cdata_end        = "on_cdata_end";
        String id_on_comment_start    = "on_comment_start";
        String id_on_comment_body     = "on_comment_body";
        String id_on_comment_end      = "on_comment_end";
        String id_on_doctype_end      = "on_doctype_end";
        String id_on_doctype_inline   = "on_doctype_inline";
        String id_on_doctype_name     = "on_doctype_name";
        String id_on_doctype_start    = "on_doctype_start";
        String id_on_doctype_type     = "on_doctype_type";
        String id_on_element_end      = "on_element_end";
        String id_on_element_name     = "on_element_name";
        String id_on_element_ns       = "on_element_ns";
        String id_on_element_open_end = "on_element_open_end";
        String id_on_proc_ins_end     = "on_proc_ins_end";
        String id_on_proc_ins_name    = "on_proc_ins_name";
        String id_on_proc_ins_start   = "on_proc_ins_start";
        String id_on_proc_ins_body    = "on_proc_ins_body";
        String id_on_string_body      = "on_string_body";
        String id_on_string_dquote    = "on_string_dquote";
        String id_on_string_squote    = "on_string_squote";
        String id_on_text             = "on_text";
        String id_on_xml_decl_end     = "on_xml_decl_end";
        String id_on_xml_decl_start   = "on_xml_decl_start";

        %% write exec;

        this.lines = lines;

        return context.nil;
    }

    /**
     * Resets the internal state of the lexer.
     *
     * Besides the Ragel state this also clears the pending newline count, so
     * newlines counted before the reset are not reported afterwards.
     *
     * @param context The current thread context.
     * @return Always nil.
     */
    @JRubyMethod
    public IRubyObject reset_native(ThreadContext context)
    {
        this.act   = 0;
        this.top   = 0;
        this.lines = 0;
        this.stack = new int[4];
        this.cs    = java_lexer_start;

        return context.nil;
    }

    /**
     * Calls back into Ruby land passing the current token value along.
     *
     * The Ruby method named `name` is called with one argument: a String
     * built from a copy of the bytes `ts` (inclusive) to `te` (exclusive) of
     * `data`, tagged with the encoding `enc`.
     *
     * @param name The name of the Ruby method to call.
     * @param data The input data.
     * @param enc The encoding of the input data.
     * @param ts The start offset of the token in `data`.
     * @param te The end offset (exclusive) of the token in `data`.
     */
    public void callback(String name, byte[] data, Encoding enc, int ts, int te)
    {
        ByteList bytelist = new ByteList(data, ts, te - ts, enc, true);

        RubyString value = this.runtime.newString(bytelist);

        ThreadContext context = this.runtime.getCurrentContext();

        this.callMethod(context, name, value);
    }

    /**
     * Calls back into Ruby land without passing any arguments.
     *
     * @param name The name of the Ruby method to call.
     */
    public void callback_simple(String name)
    {
        ThreadContext context = this.runtime.getCurrentContext();

        this.callMethod(context, name);
    }

    /**
     * Advances the line number by `amount` lines by calling the Ruby method
     * `advance_line`.
     *
     * @param amount The number of lines to advance by.
     */
    public void advance_line(int amount)
    {
        ThreadContext context = this.runtime.getCurrentContext();
        RubyFixnum count      = this.runtime.newFixnum(amount);

        this.callMethod(context, "advance_line", count);
    }

    /**
     * @see Oga::XML::Lexer#html_script?
     */
    public Boolean html_script_p()
    {
        ThreadContext context = this.runtime.getCurrentContext();

        return this.callMethod(context, "html_script?").isTrue();
    }

    /**
     * @see Oga::XML::Lexer#html_style?
     */
    public Boolean html_style_p()
    {
        ThreadContext context = this.runtime.getCurrentContext();

        return this.callMethod(context, "html_style?").isTrue();
    }
}
|
215
|
+
|
216
|
+
%%{
    # Make Ragel keep its state in the instance fields of the Lexer class
    # instead of in local variables.
    variable act this.act;
    variable cs this.cs;
    variable stack this.stack;
    variable top this.top;

    # Pull in the grammar that is shared with the C extension.
    include base_lexer "base_lexer.rl";
}%%
|
@@ -0,0 +1,633 @@
|
|
1
|
+
%%machine base_lexer;
|
2
|
+
|
3
|
+
%%{
|
4
|
+
##
|
5
|
+
# Base grammar for the XML lexer.
|
6
|
+
#
|
7
|
+
# This grammar is shared between the C and Java extensions. As a result of
|
8
|
+
# this you should **not** include language specific code in Ragel
|
9
|
+
# actions/callbacks.
|
10
|
+
#
|
11
|
+
# To call back in to Ruby you can use one of the following two functions:
|
12
|
+
#
|
13
|
+
# * callback
|
14
|
+
# * callback_simple
|
15
|
+
#
|
16
|
+
# The first function takes 5 arguments:
|
17
|
+
#
|
18
|
+
# * The name of the Ruby method to call.
|
19
|
+
# * The input data.
|
20
|
+
# * The encoding of the input data.
|
21
|
+
# * The start of the current buffer.
|
22
|
+
# * The end of the current buffer.
|
23
|
+
#
|
24
|
+
# The function callback_simple only takes one argument: the name of the
|
25
|
+
# method to call. This function should be used for callbacks that don't
|
26
|
+
# require any values.
|
27
|
+
#
|
28
|
+
# When you call a method in Ruby make sure that said method is defined as
|
29
|
+
# an instance method in the `Oga::XML::Lexer` class.
|
30
|
+
#
|
31
|
+
# The name of the callback to invoke should be an identifier starting with
|
32
|
+
# "id_". The identifier should be defined in the associated C and Java code.
|
33
|
+
# In case of C code its value should be a Symbol as a ID object, for Java
|
34
|
+
# it should be a String. For example:
|
35
|
+
#
|
36
|
+
# ID id_foo = rb_intern("foo");
|
37
|
+
#
|
38
|
+
# And for Java:
|
39
|
+
#
|
40
|
+
# String id_foo = "foo";
|
41
|
+
#
|
42
|
+
# ## Machine Transitions
|
43
|
+
#
|
44
|
+
# To transition from one machine to another prefer `fnext` over `fcall` and
|
45
|
+
# `fret`. `fcall`/`fret` require the code to keep track of a stack and are
|
46
|
+
# only used where a machine must return to its caller (e.g. strings).
|
47
|
+
#
|
48
|
+
|
49
|
+
# Basic building blocks shared by the machines below.
newline = '\r\n' | '\n' | '\r';
whitespace = [ \t];
ident_char = [a-zA-Z0-9\-_\.];
identifier = ident_char+;

whitespace_or_newline = whitespace | newline;

# Increments the `lines` counter for every "\n" that is consumed (a lone "\r"
# does not increment it). Actions that emit a token report the counter using
# advance_line() and reset it to 0.
action count_newlines {
    if ( fc == '\n' ) lines++;
}

# Advances the line number by one right away.
action advance_newline {
    advance_line(1);
}

# Puts the current character back and returns to the machine that used
# `fcall` to get here.
action hold_and_return {
    fhold;
    fret;
}
|
68
|
+
|
69
|
+
# Comments
#
# http://www.w3.org/TR/html/syntax.html#comments
#
# Unlike the W3C specification these rules *do* allow character sequences
# such as `--` and `->`. Putting extra checks in for these sequences would
# actually make the rules/actions more complex.
#

comment_start = '<!--';
comment_end = '-->';

# Everything except "-" OR a single "-"
comment_allowed = (^'-'+ | '-') $count_newlines;

# Calls on_comment_start and continues in the comment_body machine.
action start_comment {
    callback_simple(id_on_comment_start);

    fnext comment_body;
}

# Machine for the body of a comment. Every chunk of allowed characters is
# passed to on_comment_body, followed by any newlines counted in that chunk.
# The closing "-->" calls on_comment_end and hands control back to `main`.
comment_body := |*
    comment_allowed => {
        callback(id_on_comment_body, data, encoding, ts, te);

        if ( lines > 0 )
        {
            advance_line(lines);

            lines = 0;
        }
    };

    comment_end => {
        callback_simple(id_on_comment_end);

        fnext main;
    };
*|;
|
108
|
+
|
109
|
+
# CDATA
#
# http://www.w3.org/TR/html/syntax.html#cdata-sections
#
# In HTML CDATA tags have no meaning/are not supported. Oga does
# support them but treats their contents as plain text.
#

cdata_start = '<![CDATA[';
cdata_end = ']]>';

# Everything except "]" OR a single "]"
cdata_allowed = (^']'+ | ']') $count_newlines;

# Calls on_cdata_start and continues in the cdata_body machine.
action start_cdata {
    callback_simple(id_on_cdata_start);

    fnext cdata_body;
}

# Machine for the body of a CDATA section. Every chunk of allowed characters
# is passed to on_cdata_body, followed by any newlines counted in that chunk.
# The closing "]]>" calls on_cdata_end and hands control back to `main`.
cdata_body := |*
    cdata_allowed => {
        callback(id_on_cdata_body, data, encoding, ts, te);

        if ( lines > 0 )
        {
            advance_line(lines);

            lines = 0;
        }
    };

    cdata_end => {
        callback_simple(id_on_cdata_end);

        fnext main;
    };
*|;
|
147
|
+
|
148
|
+
# Processing Instructions
#
# http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
# http://en.wikipedia.org/wiki/Processing_Instruction
#
# These are tags meant to be used by parsers/libraries for custom behaviour.
# One example is the pair of tags used by PHP: <?php and ?>. Note that the
# XML declaration tags (<?xml ?>) are not considered to be a processing
# instruction.
#

proc_ins_start = '<?' identifier;
proc_ins_end = '?>';

# Everything except "?" OR a single "?"
proc_ins_allowed = (^'?'+ | '?') $count_newlines;

# Calls on_proc_ins_start followed by on_proc_ins_name. The name is the
# matched text without the leading "<?" (hence `ts + 2`).
action start_proc_ins {
    callback_simple(id_on_proc_ins_start);
    callback(id_on_proc_ins_name, data, encoding, ts + 2, te);

    fnext proc_ins_body;
}

# Machine for the body of a processing instruction. Every chunk of allowed
# characters is passed to on_proc_ins_body, followed by any newlines counted
# in that chunk. The closing "?>" calls on_proc_ins_end and hands control back
# to `main`.
proc_ins_body := |*
    proc_ins_allowed => {
        callback(id_on_proc_ins_body, data, encoding, ts, te);

        if ( lines > 0 )
        {
            advance_line(lines);

            lines = 0;
        }
    };

    proc_ins_end => {
        callback_simple(id_on_proc_ins_end);

        fnext main;
    };
*|;
|
190
|
+
|
191
|
+
# Strings
#
# Strings in HTML can either be single or double quoted. If a string
# starts with one of these quotes it must be closed with the same type
# of quote.
#
dquote = '"';
squote = "'";

# Passes the string contents to on_string_body and reports the newlines
# counted inside the string.
action emit_string {
    callback(id_on_string_body, data, encoding, ts, te);

    if ( lines > 0 )
    {
        advance_line(lines);

        lines = 0;
    }
}

# Calls on_string_squote for the opening quote and calls (`fcall`) the
# string_squote machine, which returns using `fret` at the closing quote.
action start_string_squote {
    callback_simple(id_on_string_squote);

    fcall string_squote;
}

# Same as start_string_squote but for double quoted strings.
action start_string_dquote {
    callback_simple(id_on_string_dquote);

    fcall string_dquote;
}

# Machine for the contents of a single quoted string. The closing quote calls
# on_string_squote again and returns to the calling machine.
string_squote := |*
    ^squote* $count_newlines => emit_string;

    squote => {
        callback_simple(id_on_string_squote);

        fret;
    };
*|;

# Machine for the contents of a double quoted string. The closing quote calls
# on_string_dquote again and returns to the calling machine.
string_dquote := |*
    ^dquote* $count_newlines => emit_string;

    dquote => {
        callback_simple(id_on_string_dquote);

        fret;
    };
*|;
|
242
|
+
|
243
|
+
# DOCTYPES
#
# http://www.w3.org/TR/html/syntax.html#the-doctype
#
# These rules support the 3 flavours of doctypes:
#
# 1. Normal doctypes, as introduced in the HTML5 specification.
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
# 3. Legacy doctypes
#
# The start is matched case insensitively and includes the whitespace and
# newlines that follow "<!DOCTYPE".
doctype_start = '<!DOCTYPE'i (whitespace_or_newline+ $count_newlines);

# Calls on_doctype_start, reports the newlines that were part of the start
# sequence and continues in the doctype machine.
action start_doctype {
    callback_simple(id_on_doctype_start);

    if ( lines > 0 )
    {
        advance_line(lines);

        lines = 0;
    }

    fnext doctype;
}

# Machine for processing inline rules of a doctype. Everything up to the
# closing "]" is passed to on_doctype_inline; the "]" itself jumps back to the
# doctype machine.
doctype_inline := |*
    ^']'* $count_newlines => {
        callback(id_on_doctype_inline, data, encoding, ts, te);

        if ( lines > 0 )
        {
            advance_line(lines);

            lines = 0;
        }
    };

    ']' => { fnext doctype; };
*|;

# Machine for processing doctypes. Doctype values such as the public
# and system IDs are treated as T_STRING tokens.
doctype := |*
    'PUBLIC' | 'SYSTEM' => {
        callback(id_on_doctype_type, data, encoding, ts, te);
    };

    # Starts a set of inline doctype rules.
    '[' => { fnext doctype_inline; };

    # Lex the public/system IDs as regular strings.
    squote => start_string_squote;
    dquote => start_string_dquote;

    identifier => {
        callback(id_on_doctype_name, data, encoding, ts, te);
    };

    '>' => {
        callback_simple(id_on_doctype_end);
        fnext main;
    };

    newline => advance_newline;

    # Whitespace between the parts of a doctype is skipped.
    whitespace;
*|;
|
311
|
+
|
312
|
+
# XML declaration tags
#
# http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
#
xml_decl_start = '<?xml';
xml_decl_end = '?>';

# Calls on_xml_decl_start and continues in the xml_decl machine.
action start_xml_decl {
    callback_simple(id_on_xml_decl_start);
    fnext xml_decl;
}

# Machine that processes the contents of an XML declaration tag.
#
# Newlines between the attributes are counted by the catch-all rule at the
# bottom and reported just before the next attribute name or the closing "?>"
# is emitted.
xml_decl := |*
    xml_decl_end => {
        if ( lines > 0 )
        {
            advance_line(lines);

            lines = 0;
        }

        callback_simple(id_on_xml_decl_end);

        fnext main;
    };

    # Attributes and their values (e.g. version="1.0").
    identifier => {
        if ( lines > 0 )
        {
            advance_line(lines);

            lines = 0;
        }

        callback(id_on_attribute, data, encoding, ts, te);
    };

    squote => start_string_squote;
    dquote => start_string_dquote;

    # Anything else is skipped, only newlines are counted.
    any $count_newlines;
*|;
|
356
|
+
|
357
|
+
# Elements
#
# http://www.w3.org/TR/html/syntax.html#syntax-elements
#
# Lexing of elements is broken up into different machines that handle the
# name/namespace, contents of the open tag and the body of an element. The
# body of an element is lexed using the `main` machine.
#

# element_start also matches the first character of the name, so that
# character is put back before the name is lexed by element_name.
action start_element {
    fhold;
    fnext element_name;
}

# Continues in the element_close machine after matching "</".
action start_close_element {
    fnext element_close;
}

# Calls on_element_end with the name of the element that is being closed.
action close_element {
    callback(id_on_element_end, data, encoding, ts, te);
}

# Calls on_element_end without a name and jumps back to `main`.
action close_element_fnext_main {
    callback_simple(id_on_element_end);

    fnext main;
}

element_start = '<' ident_char;
element_end = '</';

# Machine used for lexing the name/namespace of an element.
element_name := |*
    # The namespace prefix, passed along without the trailing ":".
    identifier ':' => {
        callback(id_on_element_ns, data, encoding, ts, te - 1);
    };

    identifier => {
        callback(id_on_element_name, data, encoding, ts, te);
        fnext element_head;
    };
*|;

# Machine used for lexing the closing tag of an element
element_close := |*
    # namespace prefixes, currently not used but allows the rule below it
    # to be used for the actual element name.
    identifier ':';

    identifier => close_element;

    # End of the closing tag: report the counted newlines and go back to
    # `main`.
    '>' => {
        if ( lines > 0 )
        {
            advance_line(lines);

            lines = 0;
        }

        fnext main;
    };

    # Anything else is skipped, only newlines are counted.
    any $count_newlines;
*|;
|
421
|
+
|
422
|
+
# Characters that can be used for unquoted HTML attribute values.
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
# for more info.
#
# The first character may not be a quote or whitespace; the characters that
# follow may not be "`", "=", "<", ">" or whitespace.
html_unquoted_value =
    ^(squote | dquote | whitespace_or_newline)
    ^('`' | '=' | '<' | '>' | whitespace_or_newline)+;

# Machine used after matching the "=" of an attribute and just before moving
# into the actual attribute value.
attribute_pre := |*
    # Whitespace between the "=" and the value is skipped, newlines are
    # counted.
    whitespace_or_newline $count_newlines;

    # The first non-whitespace character is put back, the counted newlines
    # are reported and, depending on `html_p`, the HTML or XML value machine
    # takes over.
    any => {
        fhold;

        if ( lines > 0 )
        {
            advance_line(lines);

            lines = 0;
        }

        if ( html_p )
        {
            fnext html_attribute_value;
        }
        else
        {
            fnext xml_attribute_value;
        }
    };
*|;

# Machine used for processing HTML attribute values.
html_attribute_value := |*
    # Quoted values are handled by the XML machine; the quote is put back so
    # that machine sees it.
    squote | dquote => {
        fhold;
        fnext xml_attribute_value;
    };

    # Unquoted attribute values are lexed as if they were single quoted
    # strings.
    html_unquoted_value => {
        callback_simple(id_on_string_squote);

        callback(id_on_string_body, data, encoding, ts, te);

        callback_simple(id_on_string_squote);
    };

    any => hold_and_return;
*|;

# Machine used for processing XML attribute values.
xml_attribute_value := |*
    # The following two actions use "fnext" instead of "fcall". Combined
    # with "element_head" using "fcall" (to "attribute_pre", which jumps here
    # using "fnext") this means we can return back to "element_head" after
    # processing a single string.
    squote => {
        callback_simple(id_on_string_squote);

        fnext string_squote;
    };

    dquote => {
        callback_simple(id_on_string_dquote);

        fnext string_dquote;
    };

    any => hold_and_return;
*|;
|
494
|
+
|
495
|
+
# Machine used for processing the contents of an element's starting tag.
# This includes the name, namespace and attributes.
element_head := |*
    newline => advance_newline;

    # Attribute names and namespaces. The namespace prefix is passed along
    # without the trailing ":".
    identifier ':' => {
        callback(id_on_attribute_ns, data, encoding, ts, te - 1);
    };

    identifier => {
        callback(id_on_attribute, data, encoding, ts, te);
    };

    # Attribute values. "fcall" is used so the value machines can return
    # here using "fret".
    '=' => {
        fcall attribute_pre;
    };

    # We're done with the open tag of the element. The bodies of HTML script
    # and style elements are lexed by dedicated machines, everything else by
    # `main`.
    '>' => {
        callback_simple(id_on_element_open_end);

        if ( html_script_p() )
        {
            fnext html_script;
        }
        else if ( html_style_p() )
        {
            fnext html_style;
        }
        else
        {
            fnext main;
        }
    };

    # Self closing tags.
    '/>' => {
        callback_simple(id_on_element_end);
        fnext main;
    };

    # Anything else (e.g. whitespace between attributes) is skipped.
    any;
*|;
|
540
|
+
|
541
|
+
# Text
#
# http://www.w3.org/TR/xml/#syntax
# http://www.w3.org/TR/html/syntax.html#text
#
# Text content is everything leading up to certain special tags such as "</"
# and "<?".

# Puts the current character back so it becomes part of the text and
# continues in the text machine.
action start_text {
    fhold;
    fnext text;
}

# These characters terminate a T_TEXT sequence and instruct Ragel to jump
# back to the main machine.
#
# Note that this only works if each sequence is exactly 2 characters
# long. Because of this "<!" is used instead of "<!--".

terminate_text = '</' | '<!' | '<?' | element_start;
allowed_text = (any* -- terminate_text) $count_newlines;

# Passes the matched text to on_text and reports the newlines counted in it.
action emit_text {
    callback(id_on_text, data, encoding, ts, te);

    if ( lines > 0 )
    {
        advance_line(lines);

        lines = 0;
    }
}

text := |*
    # Either just a terminating sequence or just text: emit it as text and go
    # back to `main`.
    terminate_text | allowed_text => {
        callback(id_on_text, data, encoding, ts, te);

        if ( lines > 0 )
        {
            advance_line(lines);

            lines = 0;
        }

        fnext main;
    };

    # Text followed by a special tag, such as "foo<!--"
    #
    # `mark` records where the text ends. Only the text up to `mark` is
    # emitted and `p` is moved back so that lexing resumes at the start of
    # the terminating sequence.
    allowed_text %{ mark = p; } terminate_text => {
        callback(id_on_text, data, encoding, ts, mark);

        p = mark - 1;
        mark = 0;

        if ( lines > 0 )
        {
            advance_line(lines);

            lines = 0;
        }

        fnext main;
    };
*|;
|
605
|
+
|
606
|
+
# Certain tags in HTML can contain basically anything except for the literal
# closing tag. Two examples are script and style tags. As a result of this
# we can't use the regular text machine.

# Either a run of characters other than "<" or a run of "<" characters.
literal_html_allowed = (^'<'+ | '<'+) $count_newlines;

# Body of an HTML script element: everything is text until "</script>".
html_script := |*
    literal_html_allowed => emit_text;
    '</script>' => close_element_fnext_main;
*|;

# Body of an HTML style element: everything is text until "</style>".
html_style := |*
    literal_html_allowed => emit_text;
    '</style>' => close_element_fnext_main;
*|;

# The main machine aka the entry point of Ragel.
#
# Every rule hands control to the machine for the construct it starts;
# anything that is not the start of a known construct is lexed as text.
main := |*
    doctype_start => start_doctype;
    xml_decl_start => start_xml_decl;
    comment_start => start_comment;
    cdata_start => start_cdata;
    proc_ins_start => start_proc_ins;
    element_start => start_element;
    element_end => start_close_element;
    any => start_text;
*|;
|
633
|
+
}%%
|