oga 0.2.0-java → 0.2.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -101,6 +101,31 @@ public class Lexer extends RubyObject
101
101
  int pe = data.length;
102
102
  int eof = data.length;
103
103
 
104
+ String id_advance_line = "advance_line";
105
+ String id_on_attribute = "on_attribute";
106
+ String id_on_attribute_ns = "on_attribute_ns";
107
+ String id_on_cdata = "on_cdata";
108
+ String id_on_comment = "on_comment";
109
+ String id_on_doctype_end = "on_doctype_end";
110
+ String id_on_doctype_inline = "on_doctype_inline";
111
+ String id_on_doctype_name = "on_doctype_name";
112
+ String id_on_doctype_start = "on_doctype_start";
113
+ String id_on_doctype_type = "on_doctype_type";
114
+ String id_on_element_end = "on_element_end";
115
+ String id_on_element_name = "on_element_name";
116
+ String id_on_element_ns = "on_element_ns";
117
+ String id_on_element_open_end = "on_element_open_end";
118
+ String id_on_element_start = "on_element_start";
119
+ String id_on_proc_ins_end = "on_proc_ins_end";
120
+ String id_on_proc_ins_name = "on_proc_ins_name";
121
+ String id_on_proc_ins_start = "on_proc_ins_start";
122
+ String id_on_string_body = "on_string_body";
123
+ String id_on_string_dquote = "on_string_dquote";
124
+ String id_on_string_squote = "on_string_squote";
125
+ String id_on_text = "on_text";
126
+ String id_on_xml_decl_end = "on_xml_decl_end";
127
+ String id_on_xml_decl_start = "on_xml_decl_start";
128
+
104
129
  %% write exec;
105
130
 
106
131
  this.lines = lines;
@@ -160,6 +185,17 @@ public class Lexer extends RubyObject
160
185
 
161
186
  this.callMethod(context, "advance_line", lines);
162
187
  }
188
+
189
+ /**
190
+ * Returns true if we're in an HTML script tag. See
191
+ * Oga::XML::Lexer#inside_html_script? for more information.
192
+ */
193
+ public Boolean inside_html_script_p()
194
+ {
195
+ ThreadContext context = this.runtime.getCurrentContext();
196
+
197
+ return this.callMethod(context, "inside_html_script?").isTrue();
198
+ }
163
199
  }
164
200
 
165
201
  %%{
@@ -28,6 +28,17 @@
28
28
  # When you call a method in Ruby make sure that said method is defined as
29
29
  # an instance method in the `Oga::XML::Lexer` class.
30
30
  #
31
+ # The name of the callback to invoke should be an identifier starting with
32
+ # "id_". The identifier should be defined in the associated C and Java code.
33
+ # In case of C code its value should be a Symbol as an ID object, for Java
34
+ # it should be a String. For example:
35
+ #
36
+ # ID id_foo = rb_intern("foo");
37
+ #
38
+ # And for Java:
39
+ #
40
+ # String id_foo = "foo";
41
+ #
31
42
  # ## Machine Transitions
32
43
  #
33
44
  # To transition from one machine to another always use `fnext` instead of
@@ -59,7 +70,7 @@
59
70
  comment = comment_start (any* -- comment_end) comment_end;
60
71
 
61
72
  action start_comment {
62
- callback("on_comment", data, encoding, ts + 4, te - 3);
73
+ callback(id_on_comment, data, encoding, ts + 4, te - 3);
63
74
  }
64
75
 
65
76
  # CDATA
@@ -75,7 +86,7 @@
75
86
  cdata = cdata_start (any* -- cdata_end) cdata_end;
76
87
 
77
88
  action start_cdata {
78
- callback("on_cdata", data, encoding, ts + 9, te - 3);
89
+ callback(id_on_cdata, data, encoding, ts + 9, te - 3);
79
90
  }
80
91
 
81
92
  # Processing Instructions
@@ -93,8 +104,8 @@
93
104
  proc_ins_end = '?>';
94
105
 
95
106
  action start_proc_ins {
96
- callback_simple("on_proc_ins_start");
97
- callback("on_proc_ins_name", data, encoding, ts + 2, te);
107
+ callback_simple(id_on_proc_ins_start);
108
+ callback(id_on_proc_ins_name, data, encoding, ts + 2, te);
98
109
 
99
110
  mark = te;
100
111
 
@@ -103,8 +114,8 @@
103
114
 
104
115
  proc_ins_body := |*
105
116
  proc_ins_end => {
106
- callback("on_text", data, encoding, mark, ts);
107
- callback_simple("on_proc_ins_end");
117
+ callback(id_on_text, data, encoding, mark, ts);
118
+ callback_simple(id_on_proc_ins_end);
108
119
 
109
120
  mark = 0;
110
121
 
@@ -124,7 +135,7 @@
124
135
  squote = "'";
125
136
 
126
137
  action emit_string {
127
- callback("on_string_body", data, encoding, ts, te);
138
+ callback(id_on_string_body, data, encoding, ts, te);
128
139
 
129
140
  if ( lines > 0 )
130
141
  {
@@ -135,13 +146,13 @@
135
146
  }
136
147
 
137
148
  action start_string_squote {
138
- callback_simple("on_string_squote");
149
+ callback_simple(id_on_string_squote);
139
150
 
140
151
  fcall string_squote;
141
152
  }
142
153
 
143
154
  action start_string_dquote {
144
- callback_simple("on_string_dquote");
155
+ callback_simple(id_on_string_dquote);
145
156
 
146
157
  fcall string_dquote;
147
158
  }
@@ -150,7 +161,7 @@
150
161
  ^squote* $count_newlines => emit_string;
151
162
 
152
163
  squote => {
153
- callback_simple("on_string_squote");
164
+ callback_simple(id_on_string_squote);
154
165
 
155
166
  fret;
156
167
  };
@@ -160,7 +171,7 @@
160
171
  ^dquote* $count_newlines => emit_string;
161
172
 
162
173
  dquote => {
163
- callback_simple("on_string_dquote");
174
+ callback_simple(id_on_string_dquote);
164
175
 
165
176
  fret;
166
177
  };
@@ -179,22 +190,35 @@
179
190
  doctype_start = '<!DOCTYPE'i whitespace+;
180
191
 
181
192
  action start_doctype {
182
- callback_simple("on_doctype_start");
193
+ callback_simple(id_on_doctype_start);
183
194
  fnext doctype;
184
195
  }
185
196
 
197
+ # Machine for processing inline rules of a doctype.
198
+ doctype_inline := |*
199
+ ^']'* $count_newlines => {
200
+ callback(id_on_doctype_inline, data, encoding, ts, te);
201
+
202
+ if ( lines > 0 )
203
+ {
204
+ advance_line(lines);
205
+
206
+ lines = 0;
207
+ }
208
+ };
209
+
210
+ ']' => { fnext doctype; };
211
+ *|;
212
+
186
213
  # Machine for processing doctypes. Doctype values such as the public
187
214
  # and system IDs are treated as T_STRING tokens.
188
215
  doctype := |*
189
216
  'PUBLIC' | 'SYSTEM' => {
190
- callback("on_doctype_type", data, encoding, ts, te);
217
+ callback(id_on_doctype_type, data, encoding, ts, te);
191
218
  };
192
219
 
193
- # Consumes everything between the [ and ]. Due to the use of :> the ]
194
- # is not consumed by any+.
195
- '[' any+ :> ']' => {
196
- callback("on_doctype_inline", data, encoding, ts + 1, te - 1);
197
- };
220
+ # Starts a set of inline doctype rules.
221
+ '[' => { fnext doctype_inline; };
198
222
 
199
223
  # Lex the public/system IDs as regular strings.
200
224
  squote => start_string_squote;
@@ -205,11 +229,11 @@
205
229
  whitespace;
206
230
 
207
231
  identifier => {
208
- callback("on_doctype_name", data, encoding, ts, te);
232
+ callback(id_on_doctype_name, data, encoding, ts, te);
209
233
  };
210
234
 
211
235
  '>' => {
212
- callback_simple("on_doctype_end");
236
+ callback_simple(id_on_doctype_end);
213
237
  fnext main;
214
238
  };
215
239
  *|;
@@ -222,20 +246,20 @@
222
246
  xml_decl_end = '?>';
223
247
 
224
248
  action start_xml_decl {
225
- callback_simple("on_xml_decl_start");
249
+ callback_simple(id_on_xml_decl_start);
226
250
  fnext xml_decl;
227
251
  }
228
252
 
229
253
  # Machine that processes the contents of an XML declaration tag.
230
254
  xml_decl := |*
231
255
  xml_decl_end => {
232
- callback_simple("on_xml_decl_end");
256
+ callback_simple(id_on_xml_decl_end);
233
257
  fnext main;
234
258
  };
235
259
 
236
260
  # Attributes and their values (e.g. version="1.0").
237
261
  identifier => {
238
- callback("on_attribute", data, encoding, ts, te);
262
+ callback(id_on_attribute, data, encoding, ts, te);
239
263
  };
240
264
 
241
265
  squote => start_string_squote;
@@ -257,23 +281,23 @@
257
281
  element_end = '</' identifier (':' identifier)* '>';
258
282
 
259
283
  action start_element {
260
- callback_simple("on_element_start");
284
+ callback_simple(id_on_element_start);
261
285
  fhold;
262
286
  fnext element_name;
263
287
  }
264
288
 
265
289
  action close_element {
266
- callback_simple("on_element_end");
290
+ callback_simple(id_on_element_end);
267
291
  }
268
292
 
269
293
  # Machine used for lexing the name/namespace of an element.
270
294
  element_name := |*
271
295
  identifier ':' => {
272
- callback("on_element_ns", data, encoding, ts, te - 1);
296
+ callback(id_on_element_ns, data, encoding, ts, te - 1);
273
297
  };
274
298
 
275
299
  identifier => {
276
- callback("on_element_name", data, encoding, ts, te);
300
+ callback(id_on_element_name, data, encoding, ts, te);
277
301
  fnext element_head;
278
302
  };
279
303
  *|;
@@ -284,16 +308,16 @@
284
308
  whitespace | '=';
285
309
 
286
310
  newline => {
287
- callback_simple("advance_line");
311
+ callback_simple(id_advance_line);
288
312
  };
289
313
 
290
314
  # Attribute names and namespaces.
291
315
  identifier ':' => {
292
- callback("on_attribute_ns", data, encoding, ts, te - 1);
316
+ callback(id_on_attribute_ns, data, encoding, ts, te - 1);
293
317
  };
294
318
 
295
319
  identifier => {
296
- callback("on_attribute", data, encoding, ts, te);
320
+ callback(id_on_attribute, data, encoding, ts, te);
297
321
  };
298
322
 
299
323
  # Attribute values.
@@ -302,13 +326,23 @@
302
326
 
303
327
  # We're done with the open tag of the element.
304
328
  '>' => {
305
- callback_simple("on_element_open_end");
306
- fnext main;
329
+ callback_simple(id_on_element_open_end);
330
+
331
+ if ( inside_html_script_p() )
332
+ {
333
+ mark = ts + 1;
334
+
335
+ fnext script_text;
336
+ }
337
+ else
338
+ {
339
+ fnext main;
340
+ }
307
341
  };
308
342
 
309
343
  # Self closing tags.
310
344
  '/>' => {
311
- callback_simple("on_element_end");
345
+ callback_simple(id_on_element_end);
312
346
  fnext main;
313
347
  };
314
348
  *|;
@@ -337,7 +371,7 @@
337
371
 
338
372
  text := |*
339
373
  terminate_text | allowed_text => {
340
- callback("on_text", data, encoding, ts, te);
374
+ callback(id_on_text, data, encoding, ts, te);
341
375
 
342
376
  if ( lines > 0 )
343
377
  {
@@ -351,7 +385,7 @@
351
385
 
352
386
  # Text followed by a special tag, such as "foo<!--"
353
387
  allowed_text %{ mark = p; } terminate_text => {
354
- callback("on_text", data, encoding, ts, mark);
388
+ callback(id_on_text, data, encoding, ts, mark);
355
389
 
356
390
  p = mark - 1;
357
391
  mark = 0;
@@ -367,6 +401,30 @@
367
401
  };
368
402
  *|;
369
403
 
404
+ # <script> tags in HTML can contain basically anything except for the
405
+ # literal "</script>". As a result of this we can't use the regular text
406
+ # machine.
407
+ script_text := |*
408
+ '</script>' => {
409
+ callback(id_on_text, data, encoding, mark, ts);
410
+
411
+ mark = 0;
412
+
413
+ if ( lines > 0 )
414
+ {
415
+ advance_line(lines);
416
+
417
+ lines = 0;
418
+ }
419
+
420
+ callback_simple(id_on_element_end);
421
+
422
+ fnext main;
423
+ };
424
+
425
+ any $count_newlines;
426
+ *|;
427
+
370
428
  # The main machine aka the entry point of Ragel.
371
429
  main := |*
372
430
  doctype_start => start_doctype;
data/lib/liboga.jar CHANGED
Binary file
data/lib/oga/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Oga
2
- VERSION = '0.2.0'
2
+ VERSION = '0.2.1'
3
3
  end # Oga
@@ -77,6 +77,13 @@ module Oga
77
77
  return xml
78
78
  end
79
79
 
80
+ ##
81
+ # @return [TrueClass|FalseClass]
82
+ #
83
+ def html?
84
+ return type == :html
85
+ end
86
+
80
87
  ##
81
88
  # Inspects the document and its child nodes. Child nodes are indented for
82
89
  # each nesting level.
@@ -212,11 +212,8 @@ module Oga
212
212
  # @param [String] text
213
213
  #
214
214
  def inner_text=(text)
215
- children.each do |child|
216
- child.remove if child.is_a?(Text)
217
- end
218
-
219
- children << XML::Text.new(:text => text)
215
+ text_node = XML::Text.new(:text => text)
216
+ @children = NodeSet.new([text_node])
220
217
  end
221
218
 
222
219
  ##
@@ -310,7 +307,7 @@ module Oga
310
307
  self_closing = children.empty?
311
308
  root = root_node
312
309
 
313
- if root.is_a?(Document) and root.type == :html \
310
+ if root.is_a?(Document) and root.html? \
314
311
  and !HTML_VOID_ELEMENTS.include?(name)
315
312
  self_closing = false
316
313
  end
@@ -10,9 +10,16 @@ module Oga
10
10
  # @return [Hash]
11
11
  #
12
12
  DECODE_MAPPING = {
13
- '&lt;' => '<',
14
- '&gt;' => '>',
15
- '&amp;' => '&'
13
+ '&lt;' => '<',
14
+ '&#60;' => '<',
15
+ '&gt;' => '>',
16
+ '&#62;' => '>',
17
+ '&apos;' => "'",
18
+ '&#39;' => "'",
19
+ '&quot;' => '"',
20
+ '&#34;' => '"',
21
+ '&amp;' => '&',
22
+ '&#38;' => '&',
16
23
  }
17
24
 
18
25
  ##
@@ -22,8 +29,10 @@ module Oga
22
29
  #
23
30
  ENCODE_MAPPING = {
24
31
  '&' => '&amp;',
32
+ '"' => '&quot;',
33
+ "'" => '&apos;',
25
34
  '>' => '&gt;',
26
- '<' => '&lt;'
35
+ '<' => '&lt;',
27
36
  }
28
37
 
29
38
  ##
data/lib/oga/xml/lexer.rb CHANGED
@@ -40,6 +40,14 @@ module Oga
40
40
  class Lexer
41
41
  attr_reader :html
42
42
 
43
+ ##
44
+ # Element name used to determine if a tag being processed is a JavaScript
45
+ # tag.
46
+ #
47
+ # @return [String]
48
+ #
49
+ SCRIPT_TAG = 'script'.freeze
50
+
43
51
  ##
44
52
  # @param [String|IO] data The data to lex. This can either be a String or
45
53
  # an IO instance.
@@ -181,6 +189,15 @@ module Oga
181
189
  return @elements.last
182
190
  end
183
191
 
192
+ ##
193
+ # Returns true if the current element is the HTML `<script>` element.
194
+ #
195
+ # @return [TrueClass|FalseClass]
196
+ #
197
+ def inside_html_script?
198
+ return html? && current_element == SCRIPT_TAG
199
+ end
200
+
184
201
  ##
185
202
  # Called when processing a single quote.
186
203
  #