oga 0.3.1-java → 0.3.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -89,6 +89,8 @@ public class Lexer extends RubyObject
89
89
  @JRubyMethod
90
90
  public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
91
91
  {
92
+ Boolean html_p = this.callMethod(context, "html").isTrue();
93
+
92
94
  Encoding encoding = rb_str.getEncoding();
93
95
 
94
96
  byte[] data = rb_str.getBytes();
@@ -104,8 +106,12 @@ public class Lexer extends RubyObject
104
106
  String id_advance_line = "advance_line";
105
107
  String id_on_attribute = "on_attribute";
106
108
  String id_on_attribute_ns = "on_attribute_ns";
107
- String id_on_cdata = "on_cdata";
108
- String id_on_comment = "on_comment";
109
+ String id_on_cdata_start = "on_cdata_start";
110
+ String id_on_cdata_body = "on_cdata_body";
111
+ String id_on_cdata_end = "on_cdata_end";
112
+ String id_on_comment_start = "on_comment_start";
113
+ String id_on_comment_body = "on_comment_body";
114
+ String id_on_comment_end = "on_comment_end";
109
115
  String id_on_doctype_end = "on_doctype_end";
110
116
  String id_on_doctype_inline = "on_doctype_inline";
111
117
  String id_on_doctype_name = "on_doctype_name";
@@ -119,6 +125,7 @@ public class Lexer extends RubyObject
119
125
  String id_on_proc_ins_end = "on_proc_ins_end";
120
126
  String id_on_proc_ins_name = "on_proc_ins_name";
121
127
  String id_on_proc_ins_start = "on_proc_ins_start";
128
+ String id_on_proc_ins_body = "on_proc_ins_body";
122
129
  String id_on_string_body = "on_string_body";
123
130
  String id_on_string_dquote = "on_string_dquote";
124
131
  String id_on_string_squote = "on_string_squote";
@@ -67,12 +67,35 @@
67
67
 
68
68
  comment_start = '<!--';
69
69
  comment_end = '-->';
70
- comment = comment_start (any* -- comment_end) comment_end;
70
+
71
+ # Everything except "-" OR a single "-"
72
+ comment_allowed = (^'-'+ | '-') $count_newlines;
71
73
 
72
74
  action start_comment {
73
- callback(id_on_comment, data, encoding, ts + 4, te - 3);
75
+ callback_simple(id_on_comment_start);
76
+
77
+ fnext comment_body;
74
78
  }
75
79
 
80
+ comment_body := |*
81
+ comment_allowed => {
82
+ callback(id_on_comment_body, data, encoding, ts, te);
83
+
84
+ if ( lines > 0 )
85
+ {
86
+ advance_line(lines);
87
+
88
+ lines = 0;
89
+ }
90
+ };
91
+
92
+ comment_end => {
93
+ callback_simple(id_on_comment_end);
94
+
95
+ fnext main;
96
+ };
97
+ *|;
98
+
76
99
  # CDATA
77
100
  #
78
101
  # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
@@ -83,12 +106,35 @@
83
106
 
84
107
  cdata_start = '<![CDATA[';
85
108
  cdata_end = ']]>';
86
- cdata = cdata_start (any* -- cdata_end) cdata_end;
109
+
110
+ # Everything except "]" OR a single "]"
111
+ cdata_allowed = (^']'+ | ']') $count_newlines;
87
112
 
88
113
  action start_cdata {
89
- callback(id_on_cdata, data, encoding, ts + 9, te - 3);
114
+ callback_simple(id_on_cdata_start);
115
+
116
+ fnext cdata_body;
90
117
  }
91
118
 
119
+ cdata_body := |*
120
+ cdata_allowed => {
121
+ callback(id_on_cdata_body, data, encoding, ts, te);
122
+
123
+ if ( lines > 0 )
124
+ {
125
+ advance_line(lines);
126
+
127
+ lines = 0;
128
+ }
129
+ };
130
+
131
+ cdata_end => {
132
+ callback_simple(id_on_cdata_end);
133
+
134
+ fnext main;
135
+ };
136
+ *|;
137
+
92
138
  # Processing Instructions
93
139
  #
94
140
  # http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
@@ -103,26 +149,33 @@
103
149
  proc_ins_start = '<?' identifier;
104
150
  proc_ins_end = '?>';
105
151
 
152
+ # Everything except "?" OR a single "?"
153
+ proc_ins_allowed = (^'?'+ | '?') $count_newlines;
154
+
106
155
  action start_proc_ins {
107
156
  callback_simple(id_on_proc_ins_start);
108
157
  callback(id_on_proc_ins_name, data, encoding, ts + 2, te);
109
158
 
110
- mark = te;
111
-
112
159
  fnext proc_ins_body;
113
160
  }
114
161
 
115
162
  proc_ins_body := |*
163
+ proc_ins_allowed => {
164
+ callback(id_on_proc_ins_body, data, encoding, ts, te);
165
+
166
+ if ( lines > 0 )
167
+ {
168
+ advance_line(lines);
169
+
170
+ lines = 0;
171
+ }
172
+ };
173
+
116
174
  proc_ins_end => {
117
- callback(id_on_text, data, encoding, mark, ts);
118
175
  callback_simple(id_on_proc_ins_end);
119
176
 
120
- mark = 0;
121
-
122
177
  fnext main;
123
178
  };
124
-
125
- any;
126
179
  *|;
127
180
 
128
181
  # Strings
@@ -253,19 +306,34 @@
253
306
  # Machine that processes the contents of an XML declaration tag.
254
307
  xml_decl := |*
255
308
  xml_decl_end => {
309
+ if ( lines > 0 )
310
+ {
311
+ advance_line(lines);
312
+
313
+ lines = 0;
314
+ }
315
+
256
316
  callback_simple(id_on_xml_decl_end);
317
+
257
318
  fnext main;
258
319
  };
259
320
 
260
321
  # Attributes and their values (e.g. version="1.0").
261
322
  identifier => {
323
+ if ( lines > 0 )
324
+ {
325
+ advance_line(lines);
326
+
327
+ lines = 0;
328
+ }
329
+
262
330
  callback(id_on_attribute, data, encoding, ts, te);
263
331
  };
264
332
 
265
333
  squote => start_string_squote;
266
334
  dquote => start_string_dquote;
267
335
 
268
- any;
336
+ any $count_newlines;
269
337
  *|;
270
338
 
271
339
  # Elements
@@ -302,10 +370,49 @@
302
370
  };
303
371
  *|;
304
372
 
373
+ action hold_start_element_head {
374
+ fhold;
375
+ fnext element_head;
376
+ }
377
+
378
+ # Characters that can be used for unquoted HTML attribute values.
379
+ # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
380
+ # for more info.
381
+ html_unquoted_value = ^(
382
+ squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline
383
+ )+;
384
+
385
+ # Machine used for processing HTML attribute values.
386
+ html_attribute_value := |*
387
+ squote | dquote => {
388
+ fhold;
389
+ fnext xml_attribute_value;
390
+ };
391
+
392
+ # Unquoted attribute values are lexed as if they were single quoted
393
+ # strings.
394
+ html_unquoted_value => {
395
+ callback_simple(id_on_string_squote);
396
+
397
+ callback(id_on_string_body, data, encoding, ts, te);
398
+
399
+ callback_simple(id_on_string_squote);
400
+ };
401
+
402
+ any => hold_start_element_head;
403
+ *|;
404
+
405
+ # Machine used for processing XML attribute values.
406
+ xml_attribute_value := |*
407
+ squote => start_string_squote;
408
+ dquote => start_string_dquote;
409
+ any => hold_start_element_head;
410
+ *|;
411
+
305
412
  # Machine used for processing the contents of an element's starting tag.
306
413
  # This includes the name, namespace and attributes.
307
414
  element_head := |*
308
- whitespace | '=';
415
+ whitespace;
309
416
 
310
417
  newline => {
311
418
  callback_simple(id_advance_line);
@@ -321,8 +428,16 @@
321
428
  };
322
429
 
323
430
  # Attribute values.
324
- squote => start_string_squote;
325
- dquote => start_string_dquote;
431
+ '=' => {
432
+ if ( html_p )
433
+ {
434
+ fnext html_attribute_value;
435
+ }
436
+ else
437
+ {
438
+ fnext xml_attribute_value;
439
+ }
440
+ };
326
441
 
327
442
  # We're done with the open tag of the element.
328
443
  '>' => {
@@ -438,8 +553,8 @@
438
553
  main := |*
439
554
  doctype_start => start_doctype;
440
555
  xml_decl_start => start_xml_decl;
441
- comment => start_comment;
442
- cdata => start_cdata;
556
+ comment_start => start_comment;
557
+ cdata_start => start_cdata;
443
558
  proc_ins_start => start_proc_ins;
444
559
  element_start => start_element;
445
560
  element_end => close_element;
data/lib/liboga.jar CHANGED
Binary file
data/lib/oga/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Oga
2
- VERSION = '0.3.1'
2
+ VERSION = '0.3.2'
3
3
  end # Oga
@@ -12,7 +12,7 @@ module Oga
12
12
  # The XML declaration of the document.
13
13
  # @return [Oga::XML::XmlDeclaration]
14
14
  #
15
- # @!attribute [rw] type
15
+ # @!attribute [r] type
16
16
  # The document type, either `:xml` or `:html`.
17
17
  # @return [Symbol]
18
18
  #
@@ -20,7 +20,9 @@ module Oga
20
20
  include Querying
21
21
  include Traversal
22
22
 
23
- attr_accessor :doctype, :xml_declaration, :type
23
+ attr_accessor :doctype, :xml_declaration
24
+
25
+ attr_reader :type
24
26
 
25
27
  ##
26
28
  # @param [Hash] options
@@ -8,7 +8,7 @@ module Oga
8
8
  # The name of the element.
9
9
  # @return [String]
10
10
  #
11
- # @!attribute [ww] namespace_name
11
+ # @!attribute [r] namespace_name
12
12
  # The name of the namespace.
13
13
  # @return [String]
14
14
  #
@@ -23,7 +23,9 @@ module Oga
23
23
  class Element < Node
24
24
  include Querying
25
25
 
26
- attr_accessor :name, :namespace_name, :attributes
26
+ attr_reader :namespace_name
27
+
28
+ attr_accessor :name, :attributes
27
29
 
28
30
  attr_writer :namespaces
29
31
 
@@ -56,6 +58,14 @@ module Oga
56
58
  register_namespaces_from_attributes
57
59
  end
58
60
 
61
+ ##
62
+ # @param [String] name
63
+ #
64
+ def namespace_name=(name)
65
+ @namespace_name = name
66
+ @namespace = nil
67
+ end
68
+
59
69
  ##
60
70
  # Returns an attribute matching the given name (with or without the
61
71
  # namespace).
@@ -289,14 +299,17 @@ module Oga
289
299
  #
290
300
  # @param [String] name
291
301
  # @param [String] uri
302
+ # @param [TrueClass|FalseClass] flush
292
303
  # @see [Oga::XML::Namespace#initialize]
293
304
  #
294
- def register_namespace(name, uri)
305
+ def register_namespace(name, uri, flush = true)
295
306
  if namespaces[name]
296
307
  raise ArgumentError, "The namespace #{name.inspect} already exists"
297
308
  end
298
309
 
299
310
  namespaces[name] = Namespace.new(:name => name, :uri => uri)
311
+
312
+ flush_namespaces_cache if flush
300
313
  end
301
314
 
302
315
  ##
@@ -306,20 +319,25 @@ module Oga
306
319
  # @return [Hash]
307
320
  #
308
321
  def available_namespaces
309
- return {} if html? # HTML(5) completely ignores namespaces
310
-
311
- merged = namespaces.dup
312
- node = parent
313
-
314
- while node && node.respond_to?(:namespaces)
315
- node.namespaces.each do |prefix, ns|
316
- merged[prefix] = ns unless merged[prefix]
322
+ # HTML(5) completely ignores namespaces
323
+ if html?
324
+ return @available_namespaces ||= {}
325
+ elsif !@available_namespaces
326
+ merged = namespaces.dup
327
+ node = parent
328
+
329
+ while node && node.respond_to?(:namespaces)
330
+ node.namespaces.each do |prefix, ns|
331
+ merged[prefix] = ns unless merged[prefix]
332
+ end
333
+
334
+ node = node.parent
317
335
  end
318
336
 
319
- node = node.parent
337
+ @available_namespaces = merged
320
338
  end
321
339
 
322
- return merged
340
+ return @available_namespaces
323
341
  end
324
342
 
325
343
  ##
@@ -339,19 +357,40 @@ module Oga
339
357
  return self_closing
340
358
  end
341
359
 
360
+ ##
361
+ # Flushes the namespaces cache of the current element and all its child
362
+ # elements.
363
+ #
364
+ def flush_namespaces_cache
365
+ @available_namespaces = nil
366
+ @namespace = nil
367
+
368
+ children.each do |child|
369
+ child.flush_namespaces_cache if child.is_a?(Element)
370
+ end
371
+ end
372
+
342
373
  private
343
374
 
344
375
  ##
345
376
  # Registers namespaces based on any "xmlns" attributes.
346
377
  #
347
378
  def register_namespaces_from_attributes
379
+ flush = false
380
+
348
381
  attributes.each do |attr|
349
382
  # We're using `namespace_name` opposed to `namespace.name` as "xmlns"
350
383
  # is not a registered namespace.
351
384
  if attr.name == XMLNS_PREFIX or attr.namespace_name == XMLNS_PREFIX
352
- register_namespace(attr.name, attr.value)
385
+ flush = true
386
+
387
+ # Ensures we only flush the cache once instead of flushing it on
388
+ # every register_namespace call.
389
+ register_namespace(attr.name, attr.value, false)
353
390
  end
354
391
  end
392
+
393
+ flush_namespaces_cache if flush
355
394
  end
356
395
 
357
396
  ##
data/lib/oga/xml/lexer.rb CHANGED
@@ -262,10 +262,40 @@ module Oga
262
262
  end
263
263
 
264
264
  ##
265
- # Called on a CDATA tag.
265
+ # Called on the open CDATA tag.
266
266
  #
267
- def on_cdata(value)
268
- add_token(:T_CDATA, value)
267
+ def on_cdata_start
268
+ add_token(:T_CDATA_START)
269
+ end
270
+
271
+ ##
272
+ # Called on the closing CDATA tag.
273
+ #
274
+ def on_cdata_end
275
+ add_token(:T_CDATA_END)
276
+ end
277
+
278
+ ##
279
+ # Called for the body of a CDATA tag.
280
+ #
281
+ # @param [String] value
282
+ #
283
+ def on_cdata_body(value)
284
+ add_token(:T_CDATA_BODY, value)
285
+ end
286
+
287
+ ##
288
+ # Called on the open comment tag.
289
+ #
290
+ def on_comment_start
291
+ add_token(:T_COMMENT_START)
292
+ end
293
+
294
+ ##
295
+ # Called on the closing comment tag.
296
+ #
297
+ def on_comment_end
298
+ add_token(:T_COMMENT_END)
269
299
  end
270
300
 
271
301
  ##
@@ -273,8 +303,8 @@ module Oga
273
303
  #
274
304
  # @param [String] value
275
305
  #
276
- def on_comment(value)
277
- add_token(:T_COMMENT, value)
306
+ def on_comment_body(value)
307
+ add_token(:T_COMMENT_BODY, value)
278
308
  end
279
309
 
280
310
  ##
@@ -314,6 +344,15 @@ module Oga
314
344
  add_token(:T_PROC_INS_NAME, value)
315
345
  end
316
346
 
347
+ ##
348
+ # Called on the body of a processing instruction.
349
+ #
350
+ # @param [String] value
351
+ #
352
+ def on_proc_ins_body(value)
353
+ add_token(:T_PROC_INS_BODY, value)
354
+ end
355
+
317
356
  ##
318
357
  # Called on the end of a processing instruction.
319
358
  #