oga 0.2.0-java → 0.2.1-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -101,6 +101,31 @@ public class Lexer extends RubyObject
101
101
  int pe = data.length;
102
102
  int eof = data.length;
103
103
 
104
+ String id_advance_line = "advance_line";
105
+ String id_on_attribute = "on_attribute";
106
+ String id_on_attribute_ns = "on_attribute_ns";
107
+ String id_on_cdata = "on_cdata";
108
+ String id_on_comment = "on_comment";
109
+ String id_on_doctype_end = "on_doctype_end";
110
+ String id_on_doctype_inline = "on_doctype_inline";
111
+ String id_on_doctype_name = "on_doctype_name";
112
+ String id_on_doctype_start = "on_doctype_start";
113
+ String id_on_doctype_type = "on_doctype_type";
114
+ String id_on_element_end = "on_element_end";
115
+ String id_on_element_name = "on_element_name";
116
+ String id_on_element_ns = "on_element_ns";
117
+ String id_on_element_open_end = "on_element_open_end";
118
+ String id_on_element_start = "on_element_start";
119
+ String id_on_proc_ins_end = "on_proc_ins_end";
120
+ String id_on_proc_ins_name = "on_proc_ins_name";
121
+ String id_on_proc_ins_start = "on_proc_ins_start";
122
+ String id_on_string_body = "on_string_body";
123
+ String id_on_string_dquote = "on_string_dquote";
124
+ String id_on_string_squote = "on_string_squote";
125
+ String id_on_text = "on_text";
126
+ String id_on_xml_decl_end = "on_xml_decl_end";
127
+ String id_on_xml_decl_start = "on_xml_decl_start";
128
+
104
129
  %% write exec;
105
130
 
106
131
  this.lines = lines;
@@ -160,6 +185,17 @@ public class Lexer extends RubyObject
160
185
 
161
186
  this.callMethod(context, "advance_line", lines);
162
187
  }
188
+
189
+ /**
190
+ * Returns true if we're in an HTML script tag. See
191
+ * Oga::XML::Lexer#inside_html_script? for more information.
192
+ */
193
+ public Boolean inside_html_script_p()
194
+ {
195
+ ThreadContext context = this.runtime.getCurrentContext();
196
+
197
+ return this.callMethod(context, "inside_html_script?").isTrue();
198
+ }
163
199
  }
164
200
 
165
201
  %%{
@@ -28,6 +28,17 @@
28
28
  # When you call a method in Ruby make sure that said method is defined as
29
29
  # an instance method in the `Oga::XML::Lexer` class.
30
30
  #
31
+ # The name of the callback to invoke should be an identifier starting with
32
+ # "id_". The identifier should be defined in the associated C and Java code.
33
+ # In case of C code its value should be a Symbol as an ID object, for Java
34
+ # it should be a String. For example:
35
+ #
36
+ # ID id_foo = rb_intern("foo");
37
+ #
38
+ # And for Java:
39
+ #
40
+ # String id_foo = "foo";
41
+ #
31
42
  # ## Machine Transitions
32
43
  #
33
44
  # To transition from one machine to another always use `fnext` instead of
@@ -59,7 +70,7 @@
59
70
  comment = comment_start (any* -- comment_end) comment_end;
60
71
 
61
72
  action start_comment {
62
- callback("on_comment", data, encoding, ts + 4, te - 3);
73
+ callback(id_on_comment, data, encoding, ts + 4, te - 3);
63
74
  }
64
75
 
65
76
  # CDATA
@@ -75,7 +86,7 @@
75
86
  cdata = cdata_start (any* -- cdata_end) cdata_end;
76
87
 
77
88
  action start_cdata {
78
- callback("on_cdata", data, encoding, ts + 9, te - 3);
89
+ callback(id_on_cdata, data, encoding, ts + 9, te - 3);
79
90
  }
80
91
 
81
92
  # Processing Instructions
@@ -93,8 +104,8 @@
93
104
  proc_ins_end = '?>';
94
105
 
95
106
  action start_proc_ins {
96
- callback_simple("on_proc_ins_start");
97
- callback("on_proc_ins_name", data, encoding, ts + 2, te);
107
+ callback_simple(id_on_proc_ins_start);
108
+ callback(id_on_proc_ins_name, data, encoding, ts + 2, te);
98
109
 
99
110
  mark = te;
100
111
 
@@ -103,8 +114,8 @@
103
114
 
104
115
  proc_ins_body := |*
105
116
  proc_ins_end => {
106
- callback("on_text", data, encoding, mark, ts);
107
- callback_simple("on_proc_ins_end");
117
+ callback(id_on_text, data, encoding, mark, ts);
118
+ callback_simple(id_on_proc_ins_end);
108
119
 
109
120
  mark = 0;
110
121
 
@@ -124,7 +135,7 @@
124
135
  squote = "'";
125
136
 
126
137
  action emit_string {
127
- callback("on_string_body", data, encoding, ts, te);
138
+ callback(id_on_string_body, data, encoding, ts, te);
128
139
 
129
140
  if ( lines > 0 )
130
141
  {
@@ -135,13 +146,13 @@
135
146
  }
136
147
 
137
148
  action start_string_squote {
138
- callback_simple("on_string_squote");
149
+ callback_simple(id_on_string_squote);
139
150
 
140
151
  fcall string_squote;
141
152
  }
142
153
 
143
154
  action start_string_dquote {
144
- callback_simple("on_string_dquote");
155
+ callback_simple(id_on_string_dquote);
145
156
 
146
157
  fcall string_dquote;
147
158
  }
@@ -150,7 +161,7 @@
150
161
  ^squote* $count_newlines => emit_string;
151
162
 
152
163
  squote => {
153
- callback_simple("on_string_squote");
164
+ callback_simple(id_on_string_squote);
154
165
 
155
166
  fret;
156
167
  };
@@ -160,7 +171,7 @@
160
171
  ^dquote* $count_newlines => emit_string;
161
172
 
162
173
  dquote => {
163
- callback_simple("on_string_dquote");
174
+ callback_simple(id_on_string_dquote);
164
175
 
165
176
  fret;
166
177
  };
@@ -179,22 +190,35 @@
179
190
  doctype_start = '<!DOCTYPE'i whitespace+;
180
191
 
181
192
  action start_doctype {
182
- callback_simple("on_doctype_start");
193
+ callback_simple(id_on_doctype_start);
183
194
  fnext doctype;
184
195
  }
185
196
 
197
+ # Machine for processing inline rules of a doctype.
198
+ doctype_inline := |*
199
+ ^']'* $count_newlines => {
200
+ callback(id_on_doctype_inline, data, encoding, ts, te);
201
+
202
+ if ( lines > 0 )
203
+ {
204
+ advance_line(lines);
205
+
206
+ lines = 0;
207
+ }
208
+ };
209
+
210
+ ']' => { fnext doctype; };
211
+ *|;
212
+
186
213
  # Machine for processing doctypes. Doctype values such as the public
187
214
  # and system IDs are treated as T_STRING tokens.
188
215
  doctype := |*
189
216
  'PUBLIC' | 'SYSTEM' => {
190
- callback("on_doctype_type", data, encoding, ts, te);
217
+ callback(id_on_doctype_type, data, encoding, ts, te);
191
218
  };
192
219
 
193
- # Consumes everything between the [ and ]. Due to the use of :> the ]
194
- # is not consumed by any+.
195
- '[' any+ :> ']' => {
196
- callback("on_doctype_inline", data, encoding, ts + 1, te - 1);
197
- };
220
+ # Starts a set of inline doctype rules.
221
+ '[' => { fnext doctype_inline; };
198
222
 
199
223
  # Lex the public/system IDs as regular strings.
200
224
  squote => start_string_squote;
@@ -205,11 +229,11 @@
205
229
  whitespace;
206
230
 
207
231
  identifier => {
208
- callback("on_doctype_name", data, encoding, ts, te);
232
+ callback(id_on_doctype_name, data, encoding, ts, te);
209
233
  };
210
234
 
211
235
  '>' => {
212
- callback_simple("on_doctype_end");
236
+ callback_simple(id_on_doctype_end);
213
237
  fnext main;
214
238
  };
215
239
  *|;
@@ -222,20 +246,20 @@
222
246
  xml_decl_end = '?>';
223
247
 
224
248
  action start_xml_decl {
225
- callback_simple("on_xml_decl_start");
249
+ callback_simple(id_on_xml_decl_start);
226
250
  fnext xml_decl;
227
251
  }
228
252
 
229
253
  # Machine that processes the contents of an XML declaration tag.
230
254
  xml_decl := |*
231
255
  xml_decl_end => {
232
- callback_simple("on_xml_decl_end");
256
+ callback_simple(id_on_xml_decl_end);
233
257
  fnext main;
234
258
  };
235
259
 
236
260
  # Attributes and their values (e.g. version="1.0").
237
261
  identifier => {
238
- callback("on_attribute", data, encoding, ts, te);
262
+ callback(id_on_attribute, data, encoding, ts, te);
239
263
  };
240
264
 
241
265
  squote => start_string_squote;
@@ -257,23 +281,23 @@
257
281
  element_end = '</' identifier (':' identifier)* '>';
258
282
 
259
283
  action start_element {
260
- callback_simple("on_element_start");
284
+ callback_simple(id_on_element_start);
261
285
  fhold;
262
286
  fnext element_name;
263
287
  }
264
288
 
265
289
  action close_element {
266
- callback_simple("on_element_end");
290
+ callback_simple(id_on_element_end);
267
291
  }
268
292
 
269
293
  # Machine used for lexing the name/namespace of an element.
270
294
  element_name := |*
271
295
  identifier ':' => {
272
- callback("on_element_ns", data, encoding, ts, te - 1);
296
+ callback(id_on_element_ns, data, encoding, ts, te - 1);
273
297
  };
274
298
 
275
299
  identifier => {
276
- callback("on_element_name", data, encoding, ts, te);
300
+ callback(id_on_element_name, data, encoding, ts, te);
277
301
  fnext element_head;
278
302
  };
279
303
  *|;
@@ -284,16 +308,16 @@
284
308
  whitespace | '=';
285
309
 
286
310
  newline => {
287
- callback_simple("advance_line");
311
+ callback_simple(id_advance_line);
288
312
  };
289
313
 
290
314
  # Attribute names and namespaces.
291
315
  identifier ':' => {
292
- callback("on_attribute_ns", data, encoding, ts, te - 1);
316
+ callback(id_on_attribute_ns, data, encoding, ts, te - 1);
293
317
  };
294
318
 
295
319
  identifier => {
296
- callback("on_attribute", data, encoding, ts, te);
320
+ callback(id_on_attribute, data, encoding, ts, te);
297
321
  };
298
322
 
299
323
  # Attribute values.
@@ -302,13 +326,23 @@
302
326
 
303
327
  # We're done with the open tag of the element.
304
328
  '>' => {
305
- callback_simple("on_element_open_end");
306
- fnext main;
329
+ callback_simple(id_on_element_open_end);
330
+
331
+ if ( inside_html_script_p() )
332
+ {
333
+ mark = ts + 1;
334
+
335
+ fnext script_text;
336
+ }
337
+ else
338
+ {
339
+ fnext main;
340
+ }
307
341
  };
308
342
 
309
343
  # Self closing tags.
310
344
  '/>' => {
311
- callback_simple("on_element_end");
345
+ callback_simple(id_on_element_end);
312
346
  fnext main;
313
347
  };
314
348
  *|;
@@ -337,7 +371,7 @@
337
371
 
338
372
  text := |*
339
373
  terminate_text | allowed_text => {
340
- callback("on_text", data, encoding, ts, te);
374
+ callback(id_on_text, data, encoding, ts, te);
341
375
 
342
376
  if ( lines > 0 )
343
377
  {
@@ -351,7 +385,7 @@
351
385
 
352
386
  # Text followed by a special tag, such as "foo<!--"
353
387
  allowed_text %{ mark = p; } terminate_text => {
354
- callback("on_text", data, encoding, ts, mark);
388
+ callback(id_on_text, data, encoding, ts, mark);
355
389
 
356
390
  p = mark - 1;
357
391
  mark = 0;
@@ -367,6 +401,30 @@
367
401
  };
368
402
  *|;
369
403
 
404
+ # <script> tags in HTML can contain basically anything except for the
405
+ # literal "</script>". As a result of this we can't use the regular text
406
+ # machine.
407
+ script_text := |*
408
+ '</script>' => {
409
+ callback(id_on_text, data, encoding, mark, ts);
410
+
411
+ mark = 0;
412
+
413
+ if ( lines > 0 )
414
+ {
415
+ advance_line(lines);
416
+
417
+ lines = 0;
418
+ }
419
+
420
+ callback_simple(id_on_element_end);
421
+
422
+ fnext main;
423
+ };
424
+
425
+ any $count_newlines;
426
+ *|;
427
+
370
428
  # The main machine aka the entry point of Ragel.
371
429
  main := |*
372
430
  doctype_start => start_doctype;
data/lib/liboga.jar CHANGED
Binary file
data/lib/oga/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Oga
2
- VERSION = '0.2.0'
2
+ VERSION = '0.2.1'
3
3
  end # Oga
@@ -77,6 +77,13 @@ module Oga
77
77
  return xml
78
78
  end
79
79
 
80
+ ##
81
+ # @return [TrueClass|FalseClass]
82
+ #
83
+ def html?
84
+ return type == :html
85
+ end
86
+
80
87
  ##
81
88
  # Inspects the document and its child nodes. Child nodes are indented for
82
89
  # each nesting level.
@@ -212,11 +212,8 @@ module Oga
212
212
  # @param [String] text
213
213
  #
214
214
  def inner_text=(text)
215
- children.each do |child|
216
- child.remove if child.is_a?(Text)
217
- end
218
-
219
- children << XML::Text.new(:text => text)
215
+ text_node = XML::Text.new(:text => text)
216
+ @children = NodeSet.new([text_node])
220
217
  end
221
218
 
222
219
  ##
@@ -310,7 +307,7 @@ module Oga
310
307
  self_closing = children.empty?
311
308
  root = root_node
312
309
 
313
- if root.is_a?(Document) and root.type == :html \
310
+ if root.is_a?(Document) and root.html? \
314
311
  and !HTML_VOID_ELEMENTS.include?(name)
315
312
  self_closing = false
316
313
  end
@@ -10,9 +10,16 @@ module Oga
10
10
  # @return [Hash]
11
11
  #
12
12
  DECODE_MAPPING = {
13
- '&lt;' => '<',
14
- '&gt;' => '>',
15
- '&amp;' => '&'
13
+ '&lt;' => '<',
14
+ '&#60;' => '<',
15
+ '&gt;' => '>',
16
+ '&#62;' => '>',
17
+ '&apos;' => "'",
18
+ '&#39;' => "'",
19
+ '&quot;' => '"',
20
+ '&#34;' => '"',
21
+ '&amp;' => '&',
22
+ '&#38;' => '&',
16
23
  }
17
24
 
18
25
  ##
@@ -22,8 +29,10 @@ module Oga
22
29
  #
23
30
  ENCODE_MAPPING = {
24
31
  '&' => '&amp;',
32
+ '"' => '&quot;',
33
+ "'" => '&apos;',
25
34
  '>' => '&gt;',
26
- '<' => '&lt;'
35
+ '<' => '&lt;',
27
36
  }
28
37
 
29
38
  ##
data/lib/oga/xml/lexer.rb CHANGED
@@ -40,6 +40,14 @@ module Oga
40
40
  class Lexer
41
41
  attr_reader :html
42
42
 
43
+ ##
44
+ # Element name used to determine if a tag being processed is a JavaScript
45
+ # tag.
46
+ #
47
+ # @return [String]
48
+ #
49
+ SCRIPT_TAG = 'script'.freeze
50
+
43
51
  ##
44
52
  # @param [String|IO] data The data to lex. This can either be a String or
45
53
  # an IO instance.
@@ -181,6 +189,15 @@ module Oga
181
189
  return @elements.last
182
190
  end
183
191
 
192
+ ##
193
+ # Returns true if the current element is the HTML `<script>` element.
194
+ #
195
+ # @return [TrueClass|FalseClass]
196
+ #
197
+ def inside_html_script?
198
+ return html? && current_element == SCRIPT_TAG
199
+ end
200
+
184
201
  ##
185
202
  # Called when processing a single quote.
186
203
  #