nokogiri 1.5.6.rc2-java → 1.5.6.rc3-java

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

@@ -45,7 +45,6 @@ import java.nio.charset.CharsetEncoder;
45
45
  import java.util.ArrayList;
46
46
  import java.util.List;
47
47
  import java.util.Set;
48
- import java.util.SortedMap;
49
48
  import java.util.regex.Matcher;
50
49
  import java.util.regex.Pattern;
51
50
 
@@ -685,9 +684,9 @@ public class NokogiriHelpers {
685
684
  if (name == null) name = "UTF-8";
686
685
  return name;
687
686
  }
688
-
689
- private static Set<String> charsetNames = ((SortedMap<String, Charset>)Charset.availableCharsets()).keySet();
690
-
687
+
688
+ private static Set<String> charsetNames = Charset.availableCharsets().keySet();
689
+
691
690
  private static String ignoreInvalidEncoding(Ruby runtime, IRubyObject encoding) {
692
691
  String givenEncoding = rubyStringToString(encoding);
693
692
  if (charsetNames.contains(givenEncoding)) return givenEncoding;
@@ -807,4 +806,13 @@ public class NokogiriHelpers {
807
806
  private static Charset shift_jis = Charset.forName("Shift_JIS");
808
807
  private static Charset jis = Charset.forName("ISO-2022-JP");
809
808
  private static Charset euc_jp = Charset.forName("EUC-JP");
809
+
810
+ public static boolean shouldEncode(Node text) {
811
+ return text.getUserData(NokogiriHelpers.ENCODED_STRING) == null ||
812
+ !((Boolean)text.getUserData(NokogiriHelpers.ENCODED_STRING));
813
+ }
814
+
815
+ public static boolean shouldDecode(Node text) {
816
+ return !shouldEncode(text);
817
+ }
810
818
  }
@@ -108,7 +108,8 @@ public class ParserContext extends RubyObject {
108
108
  (RubyIO) TypeConverter.convertToType(data,
109
109
  ruby.getIO(),
110
110
  "to_io");
111
- source.setByteStream(io.getInStream());
111
+ // use uncloseable input stream to fix #495
112
+ source.setByteStream(new UncloseableInputStream(io.getInStream()));
112
113
  } else {
113
114
  if (invoke(context, data, "respond_to?",
114
115
  ruby.newSymbol("string").to_sym()).isTrue()) {
@@ -17,10 +17,10 @@
17
17
  * distribute, sublicense, and/or sell copies of the Software, and to
18
18
  * permit persons to whom the Software is furnished to do so, subject to
19
19
  * the following conditions:
20
- *
20
+ *
21
21
  * The above copyright notice and this permission notice shall be
22
22
  * included in all copies or substantial portions of the Software.
23
- *
23
+ *
24
24
  * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
25
25
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26
26
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -65,21 +65,33 @@ import org.w3c.dom.Text;
65
65
 
66
66
  /**
67
67
  * A class for serializing a document.
68
- *
68
+ *
69
69
  * @author sergio
70
70
  * @author Patrick Mahoney <pat@polycrystal.org>
71
71
  * @author Yoko Harada <yokolet@gmail.com>
72
72
  */
73
73
  public class SaveContextVisitor {
74
74
 
75
- private StringBuffer buffer;
76
- private Stack<String> indentation;
77
- private String encoding, indentString;
78
- private boolean format, noDecl, noEmpty, noXhtml, asXhtml, asXml, asHtml, asBuilder, htmlDoc, fragment;
79
- private boolean canonical, incl_ns, with_comments, subsets, exclusive;
80
- private List<Node> c14nNodeList;
81
- private Deque<Attr[]> c14nNamespaceStack;
82
- private Deque<Attr[]> c14nAttrStack;
75
+ private final StringBuffer buffer;
76
+ private final Stack<String> indentation;
77
+ private String encoding;
78
+ private final String indentString;
79
+ private boolean format;
80
+ private final boolean noDecl;
81
+ private final boolean noEmpty;
82
+ private final boolean noXhtml;
83
+ private final boolean asXhtml;
84
+ private boolean asXml;
85
+ private final boolean asHtml;
86
+ private final boolean asBuilder;
87
+ private boolean htmlDoc;
88
+ private final boolean fragment;
89
+ private final boolean canonical, incl_ns, with_comments;
90
+ private boolean subsets;
91
+ private boolean exclusive;
92
+ private final List<Node> c14nNodeList;
93
+ private final Deque<Attr[]> c14nNamespaceStack;
94
+ private final Deque<Attr[]> c14nAttrStack;
83
95
  private List<String> c14nExclusiveInclusivePrefixes = null;
84
96
  /*
85
97
  * U can't touch this.
@@ -96,7 +108,7 @@ public class SaveContextVisitor {
96
108
  public static final int AS_XML = 32;
97
109
  public static final int AS_HTML = 64;
98
110
  public static final int AS_BUILDER = 128;
99
-
111
+
100
112
  public static final int CANONICAL = 1;
101
113
  public static final int INCL_NS = 2;
102
114
  public static final int WITH_COMMENTS = 4;
@@ -113,7 +125,7 @@ public class SaveContextVisitor {
113
125
  c14nNamespaceStack = new ArrayDeque<Attr[]>();
114
126
  c14nAttrStack = new ArrayDeque<Attr[]>();
115
127
  format = (options & FORMAT) == FORMAT;
116
-
128
+
117
129
  noDecl = (options & NO_DECL) == NO_DECL;
118
130
  noEmpty = (options & NO_EMPTY) == NO_EMPTY;
119
131
  noXhtml = (options & NO_XHTML) == NO_XHTML;
@@ -121,40 +133,40 @@ public class SaveContextVisitor {
121
133
  asXml = (options & AS_XML) == AS_XML;
122
134
  asHtml = (options & AS_HTML) == AS_HTML;
123
135
  asBuilder = (options & AS_BUILDER) == AS_BUILDER;
124
-
136
+
125
137
  canonical = (canonicalOpts & CANONICAL) == CANONICAL;
126
138
  incl_ns = (canonicalOpts & INCL_NS) == INCL_NS;
127
139
  with_comments = (canonicalOpts & WITH_COMMENTS) == WITH_COMMENTS;
128
140
  subsets = (canonicalOpts & SUBSETS) == SUBSETS;
129
-
141
+
130
142
  if ((format && indent == null) || (format && indent.length() == 0)) indent = " "; // default, two spaces
131
143
  if ((!format && indent != null) && indent.length() > 0) format = true;
132
144
  if ((asBuilder && indent == null) || (asBuilder && indent.length() == 0)) indent = " "; // default, two spaces
133
145
  indentString = indent;
134
146
  if (!asXml && !asHtml && !asXhtml && !asBuilder) asXml = true;
135
147
  }
136
-
148
+
137
149
  @Override
138
150
  public String toString() {
139
151
  return (new String(buffer));
140
152
  }
141
-
153
+
142
154
  public void setHtmlDoc(boolean htmlDoc) {
143
155
  this.htmlDoc = htmlDoc;
144
156
  }
145
-
157
+
146
158
  public void setEncoding(String encoding) {
147
159
  this.encoding = encoding;
148
160
  }
149
-
161
+
150
162
  public List<Node> getC14nNodeList() {
151
163
  return c14nNodeList;
152
164
  }
153
-
165
+
154
166
  public void setC14nExclusiveInclusivePrefixes(List<String> prefixes) {
155
167
  c14nExclusiveInclusivePrefixes = prefixes;
156
168
  }
157
-
169
+
158
170
  public boolean enter(Node node) {
159
171
  if (node instanceof Document) {
160
172
  return enter((Document)node);
@@ -181,7 +193,7 @@ public class SaveContextVisitor {
181
193
  return enter((Entity)node);
182
194
  }
183
195
  if (node instanceof EntityReference) {
184
- return enter((EntityReference)node);
196
+ return enter(node);
185
197
  }
186
198
  if (node instanceof Notation) {
187
199
  return enter((Notation)node);
@@ -191,7 +203,7 @@ public class SaveContextVisitor {
191
203
  }
192
204
  return false;
193
205
  }
194
-
206
+
195
207
  public void leave(Node node) {
196
208
  if (node instanceof Document) {
197
209
  leave((Document)node);
@@ -206,7 +218,6 @@ public class SaveContextVisitor {
206
218
  return;
207
219
  }
208
220
  if (node instanceof Text) {
209
- leave((Text)node);
210
221
  return;
211
222
  }
212
223
  if (node instanceof CDATASection) {
@@ -226,7 +237,7 @@ public class SaveContextVisitor {
226
237
  return;
227
238
  }
228
239
  if (node instanceof EntityReference) {
229
- leave((EntityReference)node);
240
+ leave(node);
230
241
  return;
231
242
  }
232
243
  if (node instanceof Notation) {
@@ -238,16 +249,16 @@ public class SaveContextVisitor {
238
249
  return;
239
250
  }
240
251
  }
241
-
252
+
242
253
  public boolean enter(String string) {
243
254
  buffer.append(string);
244
255
  return true;
245
256
  }
246
-
257
+
247
258
  public void leave(String string) {
248
259
  // no-op
249
260
  }
250
-
261
+
251
262
  public boolean enter(Attr attr) {
252
263
  String name = attr.getName();
253
264
  buffer.append(name);
@@ -260,34 +271,34 @@ public class SaveContextVisitor {
260
271
  }
261
272
  return true;
262
273
  }
263
-
264
- private static Pattern p =
274
+
275
+ private static Pattern p =
265
276
  Pattern.compile("charset(()|\\s+)=(()|\\s+)(\\w|\\_|\\.|\\-)+", Pattern.CASE_INSENSITIVE);
266
-
277
+
267
278
  private String replaceCharsetIfNecessary(Attr attr) {
268
279
  String value = attr.getValue();
269
280
  if (encoding == null) return value; // unable to replace in any case
270
281
  if (!"content".equals(attr.getName().toLowerCase())) return value; // must be content attr
271
- if (!"meta".equals(attr.getOwnerElement().getNodeName().toLowerCase())) return value;
282
+ if (!"meta".equals(attr.getOwnerElement().getNodeName().toLowerCase())) return value;
272
283
  Matcher m = p.matcher(value);
273
284
  if (!m.find()) return value;
274
285
  if (value.contains(encoding)) return value; // no need to replace
275
286
  return value.replace(m.group(), "charset=" + encoding);
276
287
  }
277
-
288
+
278
289
  public static final String[] HTML_BOOLEAN_ATTRS = {
279
290
  "checked", "compact", "declare", "defer", "disabled", "ismap",
280
291
  "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
281
292
  "selected"
282
293
  };
283
-
294
+
284
295
  private boolean isHtmlBooleanAttr(String name) {
285
296
  for (String s : HTML_BOOLEAN_ATTRS) {
286
297
  if (s.equals(name)) return true;
287
298
  }
288
299
  return false;
289
300
  }
290
-
301
+
291
302
  private String serializeAttrTextContent(String s, boolean htmlDoc) {
292
303
  if (s == null) return "";
293
304
 
@@ -299,7 +310,7 @@ public class SaveContextVisitor {
299
310
  case '\n': buffer.append("&#10;"); break;
300
311
  case '\r': buffer.append("&#13;"); break;
301
312
  case '\t': buffer.append("&#9;"); break;
302
- case '"': if (htmlDoc) buffer.append("%22");
313
+ case '"': if (htmlDoc) buffer.append("%22");
303
314
  else buffer.append("&quot;");
304
315
  break;
305
316
  case '<': buffer.append("&lt;"); break;
@@ -315,14 +326,14 @@ public class SaveContextVisitor {
315
326
  public void leave(Attr attr) {
316
327
  // no-op
317
328
  }
318
-
329
+
319
330
  public boolean enter(CDATASection cdata) {
320
331
  buffer.append("<![CDATA[");
321
332
  buffer.append(cdata.getData());
322
333
  buffer.append("]]>");
323
334
  return true;
324
335
  }
325
-
336
+
326
337
  public void leave(CDATASection cdata) {
327
338
  // no-op
328
339
  }
@@ -337,11 +348,11 @@ public class SaveContextVisitor {
337
348
  buffer.append("-->");
338
349
  return true;
339
350
  }
340
-
351
+
341
352
  public void leave(Comment comment) {
342
353
  // no-op
343
354
  }
344
-
355
+
345
356
  public boolean enter(Document document) {
346
357
  if (!noDecl) {
347
358
  buffer.append("<?xml version=\"");
@@ -357,11 +368,11 @@ public class SaveContextVisitor {
357
368
  }
358
369
  return true;
359
370
  }
360
-
371
+
361
372
  public void leave(Document document) {
362
373
  // no-op
363
374
  }
364
-
375
+
365
376
  public boolean enter(DocumentType docType) {
366
377
  if (canonical) {
367
378
  c14nNodeList.add(docType);
@@ -389,7 +400,7 @@ public class SaveContextVisitor {
389
400
  buffer.append(">\n");
390
401
  return true;
391
402
  }
392
-
403
+
393
404
  public void leave(DocumentType docType) {
394
405
  // no-op
395
406
  }
@@ -403,12 +414,12 @@ public class SaveContextVisitor {
403
414
  }
404
415
  String current = indentation.peek();
405
416
  buffer.append(current);
406
- if (needIndent()) {
417
+ if (needIndent(element)) {
407
418
  indentation.push(current + indentString);
408
419
  }
409
420
  String name = element.getTagName();
410
421
  buffer.append("<" + name);
411
- Attr[] attrs = getAttrsAndNamespaces(element);
422
+ Attr[] attrs = getAttrsAndNamespaces(element);
412
423
  for (Attr attr : attrs) {
413
424
  if (attr.getSpecified()) {
414
425
  buffer.append(" ");
@@ -423,7 +434,7 @@ public class SaveContextVisitor {
423
434
  }
424
435
  // no child
425
436
  if (asHtml || asXhtml) {
426
- buffer.append(">");
437
+ buffer.append(">");
427
438
  } else if (asXml && noEmpty) {
428
439
  buffer.append(">");
429
440
  } else {
@@ -434,26 +445,28 @@ public class SaveContextVisitor {
434
445
  }
435
446
  return true;
436
447
  }
437
-
438
- private boolean needIndent() {
448
+
449
+ private boolean needIndent(Element element) {
450
+ if (containsText(element)) return false;
439
451
  if (fragment) return false; // a given option might be fragment and format. fragment matters
440
452
  if (format || asBuilder) return true;
441
453
  return false;
442
454
  }
443
-
455
+
444
456
  private boolean needBreakInOpening(Element element) {
457
+ if (containsText(element)) return false;
445
458
  if (fragment) return false;
446
459
  if (format) return true;
447
460
  if (asBuilder && element.getFirstChild() != null && element.getFirstChild().getNodeType() == Node.ELEMENT_NODE) return true;
448
461
  if (format && element.getNextSibling() == null && element.hasChildNodes()) return true;
449
462
  return false;
450
463
  }
451
-
464
+
452
465
  private boolean isEmpty(String name) {
453
466
  HTMLElements.Element element = HTMLElements.getElement(name);
454
467
  return element.isEmpty();
455
468
  }
456
-
469
+
457
470
  private Attr[] getAttrsAndNamespaces(Element element) {
458
471
  NamedNodeMap attrs = element.getAttributes();
459
472
  if (!canonical) {
@@ -474,7 +487,7 @@ public class SaveContextVisitor {
474
487
  c14nAttrStack.push(attributeOfAncestors);
475
488
  subsets = false; // namespace propagation should be done only once on top level node.
476
489
  }
477
-
490
+
478
491
  getNamespacesAndAttrs(element, namespaces, attributes);
479
492
 
480
493
  Attr[] namespaceArray = getSortedArray(namespaces);
@@ -491,9 +504,9 @@ public class SaveContextVisitor {
491
504
  c14nAttrStack.push(attributeArray);
492
505
  return allAttrs;
493
506
  }
494
-
507
+
495
508
  }
496
-
509
+
497
510
  private void getAttrsOfAncestors(Node parent, List<Attr> namespaces, List<Attr> attributes) {
498
511
  if (parent == null) return;
499
512
  NamedNodeMap attrs = parent.getAttributes();
@@ -505,7 +518,7 @@ public class SaveContextVisitor {
505
518
  }
506
519
  getAttrsOfAncestors(parent.getParentNode(), namespaces, attributes);
507
520
  }
508
-
521
+
509
522
  private void getNamespacesAndAttrs(Node current, List<Attr> namespaces, List<Attr> attributes) {
510
523
  NamedNodeMap attrs = current.getAttributes();
511
524
  for (int i=0; i<attrs.getLength(); i++) {
@@ -531,7 +544,7 @@ public class SaveContextVisitor {
531
544
  if (parentNamespaces[n].getNodeValue().equals(attr.getNodeValue())) {
532
545
  // exactly the same namespace should not be added
533
546
  newNamespace = false;
534
- } else {
547
+ } else {
535
548
  // in case of namespace url change, propagated namespace will be override
536
549
  namespaces.remove(parentNamespaces[n]);
537
550
  }
@@ -540,7 +553,7 @@ public class SaveContextVisitor {
540
553
  if (newNamespace && !namespaces.contains(attr)) namespaces.add(attr);
541
554
  }
542
555
  }
543
-
556
+
544
557
  private void getAttributesWithPropagated(List<Attr> attributes, Attr attr) {
545
558
  boolean newAttribute = true;
546
559
  Iterator<Attr[]> iter = c14nAttrStack.iterator();
@@ -552,7 +565,7 @@ public class SaveContextVisitor {
552
565
  if (parentAttr[n].getNodeValue().equals(attr.getNodeValue())) {
553
566
  // exactly the same attribute should not be added
554
567
  newAttribute = false;
555
- } else {
568
+ } else {
556
569
  // in case of attribute value change, propagated attribute will be override
557
570
  attributes.remove(parentAttr[n]);
558
571
  }
@@ -561,7 +574,7 @@ public class SaveContextVisitor {
561
574
  if (newAttribute) attributes.add(attr);
562
575
  }
563
576
  }
564
-
577
+
565
578
  private void verifyXmlSpace(List<Attr> attributes, NamedNodeMap attrs) {
566
579
  Attr attr = (Attr) attrs.getNamedItem("xml:space");
567
580
  if (attr == null) {
@@ -573,7 +586,7 @@ public class SaveContextVisitor {
573
586
  }
574
587
  }
575
588
  }
576
-
589
+
577
590
  private Attr[] getSortedArray(List<Attr> attrList) {
578
591
  Attr[] attrArray = attrList.toArray(new Attr[0]);
579
592
  Arrays.sort(attrArray, new Comparator<Attr>() {
@@ -584,7 +597,7 @@ public class SaveContextVisitor {
584
597
  });
585
598
  return attrArray;
586
599
  }
587
-
600
+
588
601
  public void leave(Element element) {
589
602
  if (canonical) {
590
603
  c14nNamespaceStack.poll();
@@ -596,10 +609,10 @@ public class SaveContextVisitor {
596
609
  indentation.pop();
597
610
  buffer.append(indentation.peek());
598
611
  } else if (asBuilder) {
599
- indentation.pop();
612
+ if (!containsText(element)) indentation.pop();
600
613
  }
601
614
  buffer.append("</" + name + ">");
602
- if (needBreakInClosing()) {
615
+ if (needBreakInClosing(element)) {
603
616
  buffer.append("\n");
604
617
  }
605
618
  return;
@@ -610,25 +623,31 @@ public class SaveContextVisitor {
610
623
  buffer.append("</" + name + ">");
611
624
  }
612
625
  }
613
- if (needBreakInClosing()) {
614
- indentation.pop();
626
+ if (needBreakInClosing(element)) {
627
+ if (!containsText(element)) indentation.pop();
615
628
  buffer.append("\n");
616
629
  }
617
630
  }
618
-
631
+
619
632
  private boolean needIndentInClosing(Element element) {
633
+ if (containsText(element)) return false;
634
+
620
635
  if (fragment) return false; // a given option might be fragment and format. fragment matters
621
636
  if (format) return true;
622
637
  if (asBuilder && element.getFirstChild() != null && element.getFirstChild().getNodeType() == Node.ELEMENT_NODE) return true;
623
638
  return false;
624
639
  }
625
-
626
- private boolean needBreakInClosing() {
640
+
641
+ private boolean needBreakInClosing(Element element) {
627
642
  if (fragment) return false;
628
643
  if (format || asBuilder) return true;
629
644
  return false;
630
645
  }
631
646
 
647
+ private boolean containsText(Element element) {
648
+ return (element.getFirstChild() != null && element.getFirstChild().getNodeType() == Node.TEXT_NODE);
649
+ }
650
+
632
651
  public boolean enter(Entity entity) {
633
652
  String name = entity.getNodeName();
634
653
  String pubId = entity.getPublicId();
@@ -653,7 +672,7 @@ public class SaveContextVisitor {
653
672
  buffer.append(">");
654
673
  return true;
655
674
  }
656
-
675
+
657
676
  public void leave(Entity entity) {
658
677
  // no-op
659
678
  }
@@ -663,11 +682,11 @@ public class SaveContextVisitor {
663
682
  buffer.append("&" + name + ";");
664
683
  return true;
665
684
  }
666
-
685
+
667
686
  public void leaveEntityReference(Text entityRef) {
668
687
  // no-op
669
688
  }
670
-
689
+
671
690
  public boolean enter(Notation notation) {
672
691
  String name = notation.getNodeName();
673
692
  String pubId = notation.getPublicId();
@@ -691,7 +710,7 @@ public class SaveContextVisitor {
691
710
  buffer.append(">");
692
711
  return true;
693
712
  }
694
-
713
+
695
714
  public void leave(Notation notation) {
696
715
  // no-op
697
716
  }
@@ -707,7 +726,7 @@ public class SaveContextVisitor {
707
726
  if (canonical) c14nNodeList.add(pi);
708
727
  return true;
709
728
  }
710
-
729
+
711
730
  public void leave(ProcessingInstruction pi) {
712
731
  // no-op
713
732
  }
@@ -722,14 +741,8 @@ public class SaveContextVisitor {
722
741
  return true;
723
742
  }
724
743
  }
725
- if (needIndentText() && "".equals(textContent.trim())) return true;
726
- if (needIndentText()) {
727
- String current = indentation.peek();
728
- buffer.append(current);
729
- indentation.push(current + indentString);
730
- if (textContent.charAt(0) == lineSeparator) textContent = textContent.substring(1);
731
- }
732
- if (text.getUserData(NokogiriHelpers.ENCODED_STRING) == null || !((Boolean)text.getUserData(NokogiriHelpers.ENCODED_STRING))) {
744
+
745
+ if (NokogiriHelpers.shouldEncode(text)) {
733
746
  textContent = encodeJavaString(textContent);
734
747
  }
735
748
 
@@ -739,29 +752,13 @@ public class SaveContextVisitor {
739
752
  buffer.append(textContent);
740
753
  return true;
741
754
  }
742
-
743
- private boolean needIndentText() {
744
- if (fragment) return false;
745
- if (format) return true;
746
- return false;
747
- }
748
-
749
- public void leave(Text text) {
750
- String textContent = text.getNodeValue();
751
- if (needIndentText() && !"".equals(textContent.trim())) {
752
- indentation.pop();
753
- if (textContent.charAt(textContent.length()-1) != lineSeparator) {
754
- buffer.append("\n");
755
- }
756
- }
757
- }
758
-
755
+
759
756
  private String getEncoding(Text text) {
760
757
  if (encoding != null) return encoding;
761
758
  encoding = text.getOwnerDocument().getInputEncoding();
762
759
  return encoding;
763
760
  }
764
-
761
+
765
762
  private String encodeStringToHtmlEntity(String text) {
766
763
  int last = 126; // = U+007E. No need to encode under U+007E.
767
764
  StringBuffer sb = new StringBuffer();