oga 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -89,6 +89,8 @@ public class Lexer extends RubyObject
89
89
  @JRubyMethod
90
90
  public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
91
91
  {
92
+ Boolean html_p = this.callMethod(context, "html").isTrue();
93
+
92
94
  Encoding encoding = rb_str.getEncoding();
93
95
 
94
96
  byte[] data = rb_str.getBytes();
@@ -104,8 +106,12 @@ public class Lexer extends RubyObject
104
106
  String id_advance_line = "advance_line";
105
107
  String id_on_attribute = "on_attribute";
106
108
  String id_on_attribute_ns = "on_attribute_ns";
107
- String id_on_cdata = "on_cdata";
108
- String id_on_comment = "on_comment";
109
+ String id_on_cdata_start = "on_cdata_start";
110
+ String id_on_cdata_body = "on_cdata_body";
111
+ String id_on_cdata_end = "on_cdata_end";
112
+ String id_on_comment_start = "on_comment_start";
113
+ String id_on_comment_body = "on_comment_body";
114
+ String id_on_comment_end = "on_comment_end";
109
115
  String id_on_doctype_end = "on_doctype_end";
110
116
  String id_on_doctype_inline = "on_doctype_inline";
111
117
  String id_on_doctype_name = "on_doctype_name";
@@ -119,6 +125,7 @@ public class Lexer extends RubyObject
119
125
  String id_on_proc_ins_end = "on_proc_ins_end";
120
126
  String id_on_proc_ins_name = "on_proc_ins_name";
121
127
  String id_on_proc_ins_start = "on_proc_ins_start";
128
+ String id_on_proc_ins_body = "on_proc_ins_body";
122
129
  String id_on_string_body = "on_string_body";
123
130
  String id_on_string_dquote = "on_string_dquote";
124
131
  String id_on_string_squote = "on_string_squote";
@@ -67,12 +67,35 @@
67
67
 
68
68
  comment_start = '<!--';
69
69
  comment_end = '-->';
70
- comment = comment_start (any* -- comment_end) comment_end;
70
+
71
+ # A run of one or more characters other than "-", OR a single "-"
72
+ comment_allowed = (^'-'+ | '-') $count_newlines;
71
73
 
72
74
  action start_comment {
73
- callback(id_on_comment, data, encoding, ts + 4, te - 3);
75
+ callback_simple(id_on_comment_start);
76
+
77
+ fnext comment_body;
74
78
  }
75
79
 
80
+ comment_body := |*
81
+ comment_allowed => {
82
+ callback(id_on_comment_body, data, encoding, ts, te);
83
+
84
+ if ( lines > 0 )
85
+ {
86
+ advance_line(lines);
87
+
88
+ lines = 0;
89
+ }
90
+ };
91
+
92
+ comment_end => {
93
+ callback_simple(id_on_comment_end);
94
+
95
+ fnext main;
96
+ };
97
+ *|;
98
+
76
99
  # CDATA
77
100
  #
78
101
  # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
@@ -83,12 +106,35 @@
83
106
 
84
107
  cdata_start = '<![CDATA[';
85
108
  cdata_end = ']]>';
86
- cdata = cdata_start (any* -- cdata_end) cdata_end;
109
+
110
+ # A run of one or more characters other than "]", OR a single "]"
111
+ cdata_allowed = (^']'+ | ']') $count_newlines;
87
112
 
88
113
  action start_cdata {
89
- callback(id_on_cdata, data, encoding, ts + 9, te - 3);
114
+ callback_simple(id_on_cdata_start);
115
+
116
+ fnext cdata_body;
90
117
  }
91
118
 
119
+ cdata_body := |*
120
+ cdata_allowed => {
121
+ callback(id_on_cdata_body, data, encoding, ts, te);
122
+
123
+ if ( lines > 0 )
124
+ {
125
+ advance_line(lines);
126
+
127
+ lines = 0;
128
+ }
129
+ };
130
+
131
+ cdata_end => {
132
+ callback_simple(id_on_cdata_end);
133
+
134
+ fnext main;
135
+ };
136
+ *|;
137
+
92
138
  # Processing Instructions
93
139
  #
94
140
  # http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
@@ -103,26 +149,33 @@
103
149
  proc_ins_start = '<?' identifier;
104
150
  proc_ins_end = '?>';
105
151
 
152
+ # A run of one or more characters other than "?", OR a single "?"
153
+ proc_ins_allowed = (^'?'+ | '?') $count_newlines;
154
+
106
155
  action start_proc_ins {
107
156
  callback_simple(id_on_proc_ins_start);
108
157
  callback(id_on_proc_ins_name, data, encoding, ts + 2, te);
109
158
 
110
- mark = te;
111
-
112
159
  fnext proc_ins_body;
113
160
  }
114
161
 
115
162
  proc_ins_body := |*
163
+ proc_ins_allowed => {
164
+ callback(id_on_proc_ins_body, data, encoding, ts, te);
165
+
166
+ if ( lines > 0 )
167
+ {
168
+ advance_line(lines);
169
+
170
+ lines = 0;
171
+ }
172
+ };
173
+
116
174
  proc_ins_end => {
117
- callback(id_on_text, data, encoding, mark, ts);
118
175
  callback_simple(id_on_proc_ins_end);
119
176
 
120
- mark = 0;
121
-
122
177
  fnext main;
123
178
  };
124
-
125
- any;
126
179
  *|;
127
180
 
128
181
  # Strings
@@ -253,19 +306,34 @@
253
306
  # Machine that processes the contents of an XML declaration tag.
254
307
  xml_decl := |*
255
308
  xml_decl_end => {
309
+ if ( lines > 0 )
310
+ {
311
+ advance_line(lines);
312
+
313
+ lines = 0;
314
+ }
315
+
256
316
  callback_simple(id_on_xml_decl_end);
317
+
257
318
  fnext main;
258
319
  };
259
320
 
260
321
  # Attributes and their values (e.g. version="1.0").
261
322
  identifier => {
323
+ if ( lines > 0 )
324
+ {
325
+ advance_line(lines);
326
+
327
+ lines = 0;
328
+ }
329
+
262
330
  callback(id_on_attribute, data, encoding, ts, te);
263
331
  };
264
332
 
265
333
  squote => start_string_squote;
266
334
  dquote => start_string_dquote;
267
335
 
268
- any;
336
+ any $count_newlines;
269
337
  *|;
270
338
 
271
339
  # Elements
@@ -302,10 +370,49 @@
302
370
  };
303
371
  *|;
304
372
 
373
+ action hold_start_element_head {
374
+ fhold;
375
+ fnext element_head;
376
+ }
377
+
378
+ # Characters that can be used for unquoted HTML attribute values.
379
+ # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
380
+ # for more info.
381
+ html_unquoted_value = ^(
382
+ squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline
383
+ )+;
384
+
385
+ # Machine used for processing HTML attribute values.
386
+ html_attribute_value := |*
387
+ squote | dquote => {
388
+ fhold;
389
+ fnext xml_attribute_value;
390
+ };
391
+
392
+ # Unquoted attribute values are lexed as if they were single quoted
393
+ # strings.
394
+ html_unquoted_value => {
395
+ callback_simple(id_on_string_squote);
396
+
397
+ callback(id_on_string_body, data, encoding, ts, te);
398
+
399
+ callback_simple(id_on_string_squote);
400
+ };
401
+
402
+ any => hold_start_element_head;
403
+ *|;
404
+
405
+ # Machine used for processing XML attribute values.
406
+ xml_attribute_value := |*
407
+ squote => start_string_squote;
408
+ dquote => start_string_dquote;
409
+ any => hold_start_element_head;
410
+ *|;
411
+
305
412
  # Machine used for processing the contents of an element's starting tag.
306
413
  # This includes the name, namespace and attributes.
307
414
  element_head := |*
308
- whitespace | '=';
415
+ whitespace;
309
416
 
310
417
  newline => {
311
418
  callback_simple(id_advance_line);
@@ -321,8 +428,16 @@
321
428
  };
322
429
 
323
430
  # Attribute values.
324
- squote => start_string_squote;
325
- dquote => start_string_dquote;
431
+ '=' => {
432
+ if ( html_p )
433
+ {
434
+ fnext html_attribute_value;
435
+ }
436
+ else
437
+ {
438
+ fnext xml_attribute_value;
439
+ }
440
+ };
326
441
 
327
442
  # We're done with the open tag of the element.
328
443
  '>' => {
@@ -438,8 +553,8 @@
438
553
  main := |*
439
554
  doctype_start => start_doctype;
440
555
  xml_decl_start => start_xml_decl;
441
- comment => start_comment;
442
- cdata => start_cdata;
556
+ comment_start => start_comment;
557
+ cdata_start => start_cdata;
443
558
  proc_ins_start => start_proc_ins;
444
559
  element_start => start_element;
445
560
  element_end => close_element;
@@ -1,3 +1,3 @@
1
1
  module Oga
2
- VERSION = '0.3.1'
2
+ VERSION = '0.3.2'
3
3
  end # Oga
@@ -12,7 +12,7 @@ module Oga
12
12
  # The XML declaration of the document.
13
13
  # @return [Oga::XML::XmlDeclaration]
14
14
  #
15
- # @!attribute [rw] type
15
+ # @!attribute [r] type
16
16
  # The document type, either `:xml` or `:html`.
17
17
  # @return [Symbol]
18
18
  #
@@ -20,7 +20,9 @@ module Oga
20
20
  include Querying
21
21
  include Traversal
22
22
 
23
- attr_accessor :doctype, :xml_declaration, :type
23
+ attr_accessor :doctype, :xml_declaration
24
+
25
+ attr_reader :type
24
26
 
25
27
  ##
26
28
  # @param [Hash] options
@@ -8,7 +8,7 @@ module Oga
8
8
  # The name of the element.
9
9
  # @return [String]
10
10
  #
11
- # @!attribute [ww] namespace_name
11
+ # @!attribute [r] namespace_name
12
12
  # The name of the namespace.
13
13
  # @return [String]
14
14
  #
@@ -23,7 +23,9 @@ module Oga
23
23
  class Element < Node
24
24
  include Querying
25
25
 
26
- attr_accessor :name, :namespace_name, :attributes
26
+ attr_reader :namespace_name
27
+
28
+ attr_accessor :name, :attributes
27
29
 
28
30
  attr_writer :namespaces
29
31
 
@@ -56,6 +58,14 @@ module Oga
56
58
  register_namespaces_from_attributes
57
59
  end
58
60
 
61
+ ##
62
+ # @param [String] name
63
+ #
64
+ def namespace_name=(name)
65
+ @namespace_name = name
66
+ @namespace = nil
67
+ end
68
+
59
69
  ##
60
70
  # Returns an attribute matching the given name (with or without the
61
71
  # namespace).
@@ -289,14 +299,17 @@ module Oga
289
299
  #
290
300
  # @param [String] name
291
301
  # @param [String] uri
302
+ # @param [TrueClass|FalseClass] flush
292
303
  # @see Oga::XML::Namespace#initialize
293
304
  #
294
- def register_namespace(name, uri)
305
+ def register_namespace(name, uri, flush = true)
295
306
  if namespaces[name]
296
307
  raise ArgumentError, "The namespace #{name.inspect} already exists"
297
308
  end
298
309
 
299
310
  namespaces[name] = Namespace.new(:name => name, :uri => uri)
311
+
312
+ flush_namespaces_cache if flush
300
313
  end
301
314
 
302
315
  ##
@@ -306,20 +319,25 @@ module Oga
306
319
  # @return [Hash]
307
320
  #
308
321
  def available_namespaces
309
- return {} if html? # HTML(5) completely ignores namespaces
310
-
311
- merged = namespaces.dup
312
- node = parent
313
-
314
- while node && node.respond_to?(:namespaces)
315
- node.namespaces.each do |prefix, ns|
316
- merged[prefix] = ns unless merged[prefix]
322
+ # HTML(5) completely ignores namespaces
323
+ if html?
324
+ return @available_namespaces ||= {}
325
+ elsif !@available_namespaces
326
+ merged = namespaces.dup
327
+ node = parent
328
+
329
+ while node && node.respond_to?(:namespaces)
330
+ node.namespaces.each do |prefix, ns|
331
+ merged[prefix] = ns unless merged[prefix]
332
+ end
333
+
334
+ node = node.parent
317
335
  end
318
336
 
319
- node = node.parent
337
+ @available_namespaces = merged
320
338
  end
321
339
 
322
- return merged
340
+ return @available_namespaces
323
341
  end
324
342
 
325
343
  ##
@@ -339,19 +357,40 @@ module Oga
339
357
  return self_closing
340
358
  end
341
359
 
360
+ ##
361
+ # Flushes the namespaces cache of the current element and all its child
362
+ # elements.
363
+ #
364
+ def flush_namespaces_cache
365
+ @available_namespaces = nil
366
+ @namespace = nil
367
+
368
+ children.each do |child|
369
+ child.flush_namespaces_cache if child.is_a?(Element)
370
+ end
371
+ end
372
+
342
373
  private
343
374
 
344
375
  ##
345
376
  # Registers namespaces based on any "xmlns" attributes.
346
377
  #
347
378
  def register_namespaces_from_attributes
379
+ flush = false
380
+
348
381
  attributes.each do |attr|
349
382
  # We're using `namespace_name` as opposed to `namespace.name` as "xmlns"
350
383
  # is not a registered namespace.
351
384
  if attr.name == XMLNS_PREFIX or attr.namespace_name == XMLNS_PREFIX
352
- register_namespace(attr.name, attr.value)
385
+ flush = true
386
+
387
+ # Ensures we only flush the cache once instead of flushing it on
388
+ # every register_namespace call.
389
+ register_namespace(attr.name, attr.value, false)
353
390
  end
354
391
  end
392
+
393
+ flush_namespaces_cache if flush
355
394
  end
356
395
 
357
396
  ##
@@ -262,10 +262,40 @@ module Oga
262
262
  end
263
263
 
264
264
  ##
265
- # Called on a CDATA tag.
265
+ # Called on the opening CDATA tag.
266
266
  #
267
- def on_cdata(value)
268
- add_token(:T_CDATA, value)
267
+ def on_cdata_start
268
+ add_token(:T_CDATA_START)
269
+ end
270
+
271
+ ##
272
+ # Called on the closing CDATA tag.
273
+ #
274
+ def on_cdata_end
275
+ add_token(:T_CDATA_END)
276
+ end
277
+
278
+ ##
279
+ # Called for the body of a CDATA tag.
280
+ #
281
+ # @param [String] value
282
+ #
283
+ def on_cdata_body(value)
284
+ add_token(:T_CDATA_BODY, value)
285
+ end
286
+
287
+ ##
288
+ # Called on the opening comment tag.
289
+ #
290
+ def on_comment_start
291
+ add_token(:T_COMMENT_START)
292
+ end
293
+
294
+ ##
295
+ # Called on the closing comment tag.
296
+ #
297
+ def on_comment_end
298
+ add_token(:T_COMMENT_END)
269
299
  end
270
300
 
271
301
  ##
@@ -273,8 +303,8 @@ module Oga
273
303
  #
274
304
  # @param [String] value
275
305
  #
276
- def on_comment(value)
277
- add_token(:T_COMMENT, value)
306
+ def on_comment_body(value)
307
+ add_token(:T_COMMENT_BODY, value)
278
308
  end
279
309
 
280
310
  ##
@@ -314,6 +344,15 @@ module Oga
314
344
  add_token(:T_PROC_INS_NAME, value)
315
345
  end
316
346
 
347
+ ##
348
+ # Called on the body of a processing instruction.
349
+ #
350
+ # @param [String] value
351
+ #
352
+ def on_proc_ins_body(value)
353
+ add_token(:T_PROC_INS_BODY, value)
354
+ end
355
+
317
356
  ##
318
357
  # Called on the end of a processing instruction.
319
358
  #