canon 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +31 -149
- data/README.adoc +9 -0
- data/docs/advanced/semantic-diff-report.adoc +96 -0
- data/docs/features/configuration-profiles.adoc +4 -2
- data/docs/features/diff-formatting/index.adoc +3 -0
- data/docs/features/diff-formatting/whitespace-adjacency.adoc +140 -0
- data/docs/features/match-options/html-policies.adoc +2 -0
- data/docs/features/match-options/index.adoc +40 -0
- data/docs/guides/choosing-configuration.adoc +12 -1
- data/docs/reference/cli-options.adoc +3 -0
- data/docs/reference/environment-variables.adoc +3 -1
- data/docs/reference/options-across-interfaces.adoc +7 -1
- data/docs/understanding/formats/html.adoc +9 -2
- data/lib/canon/cli.rb +4 -0
- data/lib/canon/commands/diff_command.rb +1 -0
- data/lib/canon/comparison/comparison_result.rb +95 -2
- data/lib/canon/comparison/html_comparator.rb +96 -11
- data/lib/canon/comparison/markup_comparator.rb +68 -71
- data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
- data/lib/canon/comparison/match_options.rb +23 -2
- data/lib/canon/comparison/node_inspector.rb +103 -0
- data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +133 -55
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
- data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
- data/lib/canon/comparison/xml_comparator.rb +174 -7
- data/lib/canon/comparison/xml_node_comparison.rb +48 -66
- data/lib/canon/comparison.rb +143 -22
- data/lib/canon/config/env_schema.rb +2 -1
- data/lib/canon/config/profiles/metanorma.yml +3 -0
- data/lib/canon/config.rb +51 -5
- data/lib/canon/diff/diff_classifier.rb +55 -41
- data/lib/canon/diff/diff_line_builder.rb +9 -8
- data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
- data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
- data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +184 -26
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +92 -4
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
- data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
- data/lib/canon/diff_formatter.rb +128 -175
- data/lib/canon/html/data_model.rb +10 -4
- data/lib/canon/pretty_printer/html.rb +76 -14
- data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
- data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
- data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
- data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/c14n.rb +59 -5
- data/lib/canon/xml/data_model.rb +13 -1
- data/lib/canon/xml/element_matcher.rb +3 -0
- data/lib/canon/xml/node.rb +23 -1
- data/lib/canon/xml/nodes/comment_node.rb +4 -0
- data/lib/canon/xml/nodes/element_node.rb +4 -0
- data/lib/canon/xml/nodes/text_node.rb +4 -0
- data/lib/canon/xml/sax_builder.rb +29 -2
- data/lib/canon/xml/xpath_engine.rb +238 -0
- metadata +9 -2
data/lib/canon/diff_formatter.rb
CHANGED
|
@@ -7,6 +7,9 @@ require_relative "diff/diff_block"
|
|
|
7
7
|
require_relative "diff/diff_context"
|
|
8
8
|
require_relative "diff/diff_report"
|
|
9
9
|
require_relative "diff_formatter/debug_output"
|
|
10
|
+
require_relative "diff_formatter/by_line_formatter"
|
|
11
|
+
require_relative "diff_formatter/by_object_formatter"
|
|
12
|
+
require_relative "diff_formatter/pretty_diff_formatter"
|
|
10
13
|
|
|
11
14
|
module Canon
|
|
12
15
|
# Formatter for displaying semantic differences with color support
|
|
@@ -301,17 +304,26 @@ module Canon
|
|
|
301
304
|
# @param html_version [Symbol, nil] HTML version (:html4 or :html5)
|
|
302
305
|
# @return [String] Formatted output
|
|
303
306
|
def format(differences, format, doc1: nil, doc2: nil, html_version: nil)
|
|
304
|
-
# In by-line mode, always use by-line diff
|
|
307
|
+
# In by-line mode with both docs present, always use by-line diff
|
|
305
308
|
if @mode == :by_line && doc1 && doc2
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
+
doc1, doc2 = apply_display_preprocessing(doc1, doc2, format)
|
|
310
|
+
# rubocop:disable Layout/HashAlignment
|
|
311
|
+
return by_line_formatter.format(doc1, doc2, format: format,
|
|
312
|
+
html_version: html_version,
|
|
313
|
+
differences: differences)
|
|
314
|
+
# rubocop:enable Layout/HashAlignment
|
|
309
315
|
end
|
|
310
316
|
|
|
311
317
|
# In pretty_diff mode, always use text-LCS diff (bypasses DiffNodeMapper).
|
|
312
|
-
# pretty_diff_format handles nil doc1/doc2 itself (emits header only).
|
|
313
318
|
if @mode == :pretty_diff
|
|
314
|
-
|
|
319
|
+
d1, d2 = if doc1 && doc2
|
|
320
|
+
apply_display_preprocessing(doc1, doc2,
|
|
321
|
+
format)
|
|
322
|
+
else
|
|
323
|
+
[doc1,
|
|
324
|
+
doc2]
|
|
325
|
+
end
|
|
326
|
+
return pretty_diff_formatter.format(d1, d2, format: format)
|
|
315
327
|
end
|
|
316
328
|
|
|
317
329
|
no_diffs = if differences.respond_to?(:equivalent?)
|
|
@@ -323,12 +335,26 @@ module Canon
|
|
|
323
335
|
|
|
324
336
|
case @mode
|
|
325
337
|
when :by_line
|
|
326
|
-
|
|
327
|
-
|
|
338
|
+
if doc1 && doc2
|
|
339
|
+
doc1, doc2 = apply_display_preprocessing(doc1, doc2,
|
|
340
|
+
format)
|
|
341
|
+
end
|
|
342
|
+
# rubocop:disable Layout/HashAlignment
|
|
343
|
+
by_line_formatter.format(doc1, doc2, format: format,
|
|
344
|
+
html_version: html_version,
|
|
345
|
+
differences: differences)
|
|
346
|
+
# rubocop:enable Layout/HashAlignment
|
|
328
347
|
when :pretty_diff
|
|
329
|
-
|
|
348
|
+
d1, d2 = if doc1 && doc2
|
|
349
|
+
apply_display_preprocessing(doc1, doc2,
|
|
350
|
+
format)
|
|
351
|
+
else
|
|
352
|
+
[doc1,
|
|
353
|
+
doc2]
|
|
354
|
+
end
|
|
355
|
+
pretty_diff_formatter.format(d1, d2, format: format)
|
|
330
356
|
else
|
|
331
|
-
|
|
357
|
+
by_object_formatter.format(differences, format)
|
|
332
358
|
end
|
|
333
359
|
end
|
|
334
360
|
|
|
@@ -340,8 +366,13 @@ module Canon
|
|
|
340
366
|
# @param actual [Object] Actual value
|
|
341
367
|
# @return [String] Formatted diff output
|
|
342
368
|
def format_comparison_result(comparison_result, expected, actual)
|
|
343
|
-
#
|
|
344
|
-
|
|
369
|
+
# Prefer the matcher-supplied format (e.g. :html4 from
|
|
370
|
+
# be_html4_equivalent_to). Auto-detection from the expected string
|
|
371
|
+
# cannot distinguish HTML from XML for fragments like
|
|
372
|
+
# `<div class="x"></div>` and would mis-route HTML fixtures
|
|
373
|
+
# through the XML pretty-printer (issue #135).
|
|
374
|
+
format = (comparison_result.is_a?(Canon::Comparison::ComparisonResult) && comparison_result.format) ||
|
|
375
|
+
Canon::Comparison::FormatDetector.detect(expected)
|
|
345
376
|
|
|
346
377
|
formatter_options = {
|
|
347
378
|
use_color: @use_color,
|
|
@@ -366,6 +397,18 @@ module Canon
|
|
|
366
397
|
output << "" # Blank line for spacing
|
|
367
398
|
end
|
|
368
399
|
|
|
400
|
+
# Parse-error banner. When libxml flagged any errors during
|
|
401
|
+
# parsing, surface them at the top of the report so the user
|
|
402
|
+
# is not left chasing diffs that describe a partial tree.
|
|
403
|
+
# See lutaml/canon#130.
|
|
404
|
+
if comparison_result.is_a?(Canon::Comparison::ComparisonResult) &&
|
|
405
|
+
comparison_result.parse_errors?
|
|
406
|
+
output << format_parse_error_banner(
|
|
407
|
+
comparison_result.parse_errors_expected,
|
|
408
|
+
comparison_result.parse_errors_received,
|
|
409
|
+
)
|
|
410
|
+
end
|
|
411
|
+
|
|
369
412
|
# 1. CANON VERBOSE tables (ONLY if CANON_VERBOSE=1)
|
|
370
413
|
verbose_tables = DebugOutput.verbose_tables_only(
|
|
371
414
|
comparison_result,
|
|
@@ -481,6 +524,53 @@ module Canon
|
|
|
481
524
|
|
|
482
525
|
private
|
|
483
526
|
|
|
527
|
+
# Render the parse-error banner that appears at the top of the
|
|
528
|
+
# diff report when libxml flagged any errors during parsing.
|
|
529
|
+
# Names the offending side(s) and warns that the diff below
|
|
530
|
+
# describes the parsed tree, not the input. See lutaml/canon#130.
|
|
531
|
+
#
|
|
532
|
+
# @param errors_expected [Array<String>] Errors from the expected side
|
|
533
|
+
# @param errors_received [Array<String>] Errors from the received side
|
|
534
|
+
# @return [String] Multi-line banner
|
|
535
|
+
def format_parse_error_banner(errors_expected, errors_received)
|
|
536
|
+
lines = []
|
|
537
|
+
rule = "=" * 70
|
|
538
|
+
lines << colorize(rule, :yellow, :bold)
|
|
539
|
+
lines << colorize(" ⚠️ PARSE ERRORS", :yellow, :bold)
|
|
540
|
+
lines << colorize(rule, :yellow, :bold)
|
|
541
|
+
|
|
542
|
+
if errors_expected.any?
|
|
543
|
+
lines << colorize(" Expected side:", :yellow, :bold)
|
|
544
|
+
errors_expected.each do |err|
|
|
545
|
+
lines << " #{colorize(err, :red)}"
|
|
546
|
+
end
|
|
547
|
+
end
|
|
548
|
+
|
|
549
|
+
if errors_received.any?
|
|
550
|
+
lines << colorize(" Received side:", :yellow, :bold)
|
|
551
|
+
errors_received.each do |err|
|
|
552
|
+
lines << " #{colorize(err, :red)}"
|
|
553
|
+
end
|
|
554
|
+
end
|
|
555
|
+
|
|
556
|
+
lines << ""
|
|
557
|
+
lines << colorize(
|
|
558
|
+
" ⚠️ The diff below describes the parsed tree, not the input.",
|
|
559
|
+
:yellow,
|
|
560
|
+
)
|
|
561
|
+
lines << colorize(
|
|
562
|
+
" Content that the parser could not represent has been",
|
|
563
|
+
:yellow,
|
|
564
|
+
)
|
|
565
|
+
lines << colorize(
|
|
566
|
+
" dropped and may appear as \"missing\" in the report.",
|
|
567
|
+
:yellow,
|
|
568
|
+
)
|
|
569
|
+
lines << colorize(rule, :yellow, :bold)
|
|
570
|
+
lines << ""
|
|
571
|
+
lines.join("\n")
|
|
572
|
+
end
|
|
573
|
+
|
|
484
574
|
# Normalize content for display in diffs
|
|
485
575
|
#
|
|
486
576
|
# @param expected [Object] Expected value
|
|
@@ -685,10 +775,8 @@ module Canon
|
|
|
685
775
|
# false disables all visualization
|
|
686
776
|
return {} if character_visualization == false
|
|
687
777
|
|
|
688
|
-
# :content_only
|
|
689
|
-
#
|
|
690
|
-
# keeping structural indentation whitespace plain.
|
|
691
|
-
# See docs/features/diff-formatting/character-visualization.adoc
|
|
778
|
+
# :content_only builds the full map; the by_line formatter applies
|
|
779
|
+
# it only to content portions, leaving structural indentation plain.
|
|
692
780
|
|
|
693
781
|
return visualization_map if visualization_map
|
|
694
782
|
|
|
@@ -723,177 +811,37 @@ module Canon
|
|
|
723
811
|
colorize("#{emoji}#{message}\n", :green, :bold)
|
|
724
812
|
end
|
|
725
813
|
|
|
726
|
-
#
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
# Extract differences array from ComparisonResult if needed
|
|
733
|
-
diffs_array = if differences.is_a?(Canon::Comparison::ComparisonResult)
|
|
734
|
-
differences.differences
|
|
735
|
-
else
|
|
736
|
-
differences
|
|
737
|
-
end
|
|
738
|
-
|
|
739
|
-
# Delegate to format-specific formatter
|
|
740
|
-
formatter = ByObject::BaseFormatter.for_format(
|
|
741
|
-
format,
|
|
814
|
+
# Factory methods for mode-specific formatters
|
|
815
|
+
|
|
816
|
+
# @return [ByLineFormatter]
|
|
817
|
+
def by_line_formatter
|
|
818
|
+
@by_line_formatter ||= ByLineFormatter.new(
|
|
742
819
|
use_color: @use_color,
|
|
743
820
|
visualization_map: @visualization_map,
|
|
821
|
+
context_lines: @context_lines,
|
|
822
|
+
diff_grouping_lines: @diff_grouping_lines,
|
|
744
823
|
show_diffs: @show_diffs,
|
|
824
|
+
character_visualization: @character_visualization,
|
|
825
|
+
legacy_terminal: @legacy_terminal,
|
|
826
|
+
diff_mode: @diff_mode,
|
|
745
827
|
)
|
|
746
|
-
|
|
747
|
-
output << formatter.format(diffs_array, format)
|
|
748
|
-
|
|
749
|
-
output.join("\n")
|
|
750
828
|
end
|
|
751
829
|
|
|
752
|
-
#
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
differences: [])
|
|
756
|
-
# For HTML format, use html_version if provided, otherwise default to :html4
|
|
757
|
-
if format == :html && html_version
|
|
758
|
-
format = html_version # Use :html4 or :html5
|
|
759
|
-
end
|
|
760
|
-
|
|
761
|
-
# Format display name for header
|
|
762
|
-
format_name = format.to_s.upcase
|
|
763
|
-
|
|
764
|
-
output = []
|
|
765
|
-
output << colorize("Line-by-line diff (#{format_name} mode):", :cyan,
|
|
766
|
-
:bold)
|
|
767
|
-
|
|
768
|
-
return output.join("\n") if doc1.nil? || doc2.nil?
|
|
769
|
-
|
|
770
|
-
# Apply display preprocessing (format both sides identically before diff)
|
|
771
|
-
doc1, doc2 = apply_display_preprocessing(doc1, doc2, format)
|
|
772
|
-
# Extract differences array and equivalent status from ComparisonResult if needed
|
|
773
|
-
diffs_array = if differences.is_a?(Canon::Comparison::ComparisonResult)
|
|
774
|
-
@comparison_equivalent = differences.equivalent?
|
|
775
|
-
differences.differences
|
|
776
|
-
else
|
|
777
|
-
@comparison_equivalent = nil
|
|
778
|
-
differences
|
|
779
|
-
end
|
|
780
|
-
|
|
781
|
-
# Delegate to format-specific formatter
|
|
782
|
-
formatter = ByLine::BaseFormatter.for_format(
|
|
783
|
-
format,
|
|
830
|
+
# @return [ByObjectFormatter]
|
|
831
|
+
def by_object_formatter
|
|
832
|
+
@by_object_formatter ||= ByObjectFormatter.new(
|
|
784
833
|
use_color: @use_color,
|
|
785
|
-
context_lines: @context_lines,
|
|
786
|
-
diff_grouping_lines: @diff_grouping_lines,
|
|
787
834
|
visualization_map: @visualization_map,
|
|
788
835
|
show_diffs: @show_diffs,
|
|
789
|
-
differences: diffs_array,
|
|
790
|
-
diff_mode: @legacy_terminal ? :separate : @diff_mode,
|
|
791
|
-
legacy_terminal: @legacy_terminal,
|
|
792
|
-
equivalent: @comparison_equivalent,
|
|
793
836
|
)
|
|
794
|
-
|
|
795
|
-
output << formatter.format(doc1, doc2)
|
|
796
|
-
|
|
797
|
-
output.join("\n")
|
|
798
|
-
end
|
|
799
|
-
|
|
800
|
-
# Generate a text-LCS diff against preprocessed lines (pretty_diff mode).
|
|
801
|
-
#
|
|
802
|
-
# This mode bypasses DiffNodeMapper entirely: it applies display_preprocessing
|
|
803
|
-
# to both sides, then runs Diff::LCS.sdiff on the resulting plain-text lines.
|
|
804
|
-
# It is a reliable short-term workaround for #85 (normative changes invisible
|
|
805
|
-
# in :by_line mode when DiffNodeMapper's DOM-address correlation is off).
|
|
806
|
-
#
|
|
807
|
-
# Limitations:
|
|
808
|
-
# - show_diffs :normative / :informative filter is ignored (no DiffNodes)
|
|
809
|
-
# - No inline character highlighting (whole-line granularity only)
|
|
810
|
-
#
|
|
811
|
-
# @param doc1 [String] First document
|
|
812
|
-
# @param doc2 [String] Second document
|
|
813
|
-
# @param format [Symbol] Document format
|
|
814
|
-
# @return [String] Formatted diff output
|
|
815
|
-
def pretty_diff_format(doc1, doc2, format:)
|
|
816
|
-
require "diff/lcs"
|
|
817
|
-
|
|
818
|
-
resolved_format = format
|
|
819
|
-
|
|
820
|
-
format_name = resolved_format.to_s.upcase
|
|
821
|
-
output = []
|
|
822
|
-
output << colorize("Pretty diff (#{format_name} mode):", :cyan, :bold)
|
|
823
|
-
|
|
824
|
-
return output.join("\n") if doc1.nil? || doc2.nil?
|
|
825
|
-
|
|
826
|
-
# Apply display preprocessing — same transforms as by_line_diff
|
|
827
|
-
d1, d2 = apply_display_preprocessing(doc1, doc2, resolved_format)
|
|
828
|
-
|
|
829
|
-
lines1 = d1.lines.map(&:chomp)
|
|
830
|
-
lines2 = d2.lines.map(&:chomp)
|
|
831
|
-
|
|
832
|
-
hunks = ::Diff::LCS.sdiff(lines1, lines2)
|
|
833
|
-
|
|
834
|
-
output << render_pretty_diff(hunks)
|
|
835
|
-
output.join("\n")
|
|
836
837
|
end
|
|
837
838
|
|
|
838
|
-
#
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
# @param hunks [Array<Diff::LCS::ContextChange>] Output of Diff::LCS.sdiff
|
|
845
|
-
# @return [String] Rendered diff lines joined with "\n"
|
|
846
|
-
def render_pretty_diff(hunks)
|
|
847
|
-
# Identify positions of changed hunks
|
|
848
|
-
changed = hunks.each_index.reject { |i| hunks[i].action == "=" }
|
|
849
|
-
|
|
850
|
-
return colorize(" (no differences)", :green) if changed.empty?
|
|
851
|
-
|
|
852
|
-
ctx = [@context_lines || 3, 0].max
|
|
853
|
-
|
|
854
|
-
# Build expanded windows, then merge overlapping/adjacent ones
|
|
855
|
-
windows = changed.map do |pos|
|
|
856
|
-
[
|
|
857
|
-
[pos - ctx, 0].max,
|
|
858
|
-
[pos + ctx, hunks.length - 1].min,
|
|
859
|
-
]
|
|
860
|
-
end
|
|
861
|
-
|
|
862
|
-
merged = []
|
|
863
|
-
windows.each do |lo, hi|
|
|
864
|
-
if merged.empty? || lo > merged.last[1] + 1
|
|
865
|
-
merged << [lo, hi]
|
|
866
|
-
else
|
|
867
|
-
merged.last[1] = [merged.last[1], hi].max
|
|
868
|
-
end
|
|
869
|
-
end
|
|
870
|
-
|
|
871
|
-
lines = []
|
|
872
|
-
merged.each_with_index do |(lo, hi), block_idx|
|
|
873
|
-
# Separator between non-adjacent blocks
|
|
874
|
-
if block_idx.positive?
|
|
875
|
-
lines << colorize("--- ---", :cyan)
|
|
876
|
-
elsif lo.positive?
|
|
877
|
-
lines << colorize("--- ---", :cyan)
|
|
878
|
-
end
|
|
879
|
-
|
|
880
|
-
(lo..hi).each do |i|
|
|
881
|
-
hunk = hunks[i]
|
|
882
|
-
case hunk.action
|
|
883
|
-
when "="
|
|
884
|
-
lines << (@use_color ? "\e[0m #{hunk.old_element}" : " #{hunk.old_element}")
|
|
885
|
-
when "-"
|
|
886
|
-
lines << colorize("- #{hunk.old_element}", :red)
|
|
887
|
-
when "+"
|
|
888
|
-
lines << colorize("+ #{hunk.new_element}", :green)
|
|
889
|
-
when "!"
|
|
890
|
-
lines << colorize("- #{hunk.old_element}", :red)
|
|
891
|
-
lines << colorize("+ #{hunk.new_element}", :green)
|
|
892
|
-
end
|
|
893
|
-
end
|
|
894
|
-
end
|
|
895
|
-
|
|
896
|
-
lines.join("\n")
|
|
839
|
+
# @return [PrettyDiffFormatter]
|
|
840
|
+
def pretty_diff_formatter
|
|
841
|
+
@pretty_diff_formatter ||= PrettyDiffFormatter.new(
|
|
842
|
+
use_color: @use_color,
|
|
843
|
+
context_lines: @context_lines,
|
|
844
|
+
)
|
|
897
845
|
end
|
|
898
846
|
|
|
899
847
|
# Apply display preprocessing to both documents before the line diff.
|
|
@@ -966,6 +914,7 @@ differences: [])
|
|
|
966
914
|
collapse_whitespace_elements: @collapse_whitespace_elements,
|
|
967
915
|
strip_whitespace_elements: @strip_whitespace_elements,
|
|
968
916
|
sort_attributes: @pretty_printer_sort_attributes,
|
|
917
|
+
html_mode: %i[html html4 html5].include?(format),
|
|
969
918
|
}
|
|
970
919
|
|
|
971
920
|
printer_expected = Canon::PrettyPrinter::XmlNormalized.new(
|
|
@@ -1047,9 +996,13 @@ differences: [])
|
|
|
1047
996
|
|
|
1048
997
|
if %i[html html4 html5].include?(format)
|
|
1049
998
|
require "canon/pretty_printer/html"
|
|
999
|
+
# Fixture-ready mode actually indents (libxml FORMAT save flag
|
|
1000
|
+
# via AS_XHTML). The default mode is structurally faithful but
|
|
1001
|
+
# does not indent on HTML5 input -- see lutaml/canon#133.
|
|
1050
1002
|
printer = Canon::PrettyPrinter::Html.new(
|
|
1051
1003
|
indent: @pretty_printer_indent,
|
|
1052
1004
|
indent_type: indent_type_str,
|
|
1005
|
+
fixture_ready: true,
|
|
1053
1006
|
)
|
|
1054
1007
|
elsif format == :xml
|
|
1055
1008
|
require "canon/pretty_printer/xml"
|
|
@@ -208,19 +208,25 @@ module Canon
|
|
|
208
208
|
|
|
209
209
|
# Build text node from Nokogiri text node
|
|
210
210
|
# HTML-specific: handles whitespace-sensitive elements (pre, code, textarea, script, style)
|
|
211
|
+
# and preserves whitespace between inline element siblings.
|
|
211
212
|
def self.build_text_node(nokogiri_text)
|
|
212
213
|
# Skip text nodes that are only whitespace between elements
|
|
213
214
|
# EXCEPT in whitespace-sensitive elements (pre, code, textarea, script, style)
|
|
214
|
-
#
|
|
215
|
+
# and when whitespace is between inline element siblings (semantically significant)
|
|
215
216
|
content = nokogiri_text.content
|
|
216
217
|
|
|
217
|
-
|
|
218
|
+
# NBSP (U+00A0) is never insignificant whitespace
|
|
219
|
+
if content.strip.empty? && nokogiri_text.parent.is_a?(Nokogiri::XML::Element) && !content.include?("\u00A0")
|
|
218
220
|
# Check if parent is whitespace-sensitive
|
|
219
221
|
parent_name = nokogiri_text.parent.name.downcase
|
|
220
222
|
whitespace_sensitive_tags = %w[pre code textarea script style]
|
|
221
223
|
|
|
222
|
-
#
|
|
223
|
-
|
|
224
|
+
# Check if whitespace is between inline siblings
|
|
225
|
+
require_relative "../comparison/whitespace_sensitivity"
|
|
226
|
+
unless whitespace_sensitive_tags.include?(parent_name) ||
|
|
227
|
+
Canon::Comparison::WhitespaceSensitivity.inline_whitespace_significant?(nokogiri_text)
|
|
228
|
+
return nil
|
|
229
|
+
end
|
|
224
230
|
end
|
|
225
231
|
|
|
226
232
|
# Nokogiri already handles CDATA conversion and entity resolution
|
|
@@ -1,19 +1,43 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "nokogiri"
|
|
4
|
+
require "stringio"
|
|
5
|
+
require_relative "html_void_elements"
|
|
4
6
|
|
|
5
7
|
module Canon
|
|
6
8
|
module PrettyPrinter
|
|
7
|
-
# Pretty printer for HTML with consistent indentation
|
|
9
|
+
# Pretty printer for HTML with consistent indentation.
|
|
10
|
+
#
|
|
11
|
+
# Two modes:
|
|
12
|
+
#
|
|
13
|
+
# 1. Default mode (+fixture_ready: false+): retains the existing
|
|
14
|
+
# behaviour for callers that use the pretty-printer as a
|
|
15
|
+
# structural normaliser (the canon round-trip tests, the
|
|
16
|
+
# diff-pipeline +apply_pretty_print+ stage, etc). These callers
|
|
17
|
+
# do not require actual indentation; they require structural
|
|
18
|
+
# equivalence to the input.
|
|
19
|
+
#
|
|
20
|
+
# 2. Fixture-ready mode (+fixture_ready: true+): emits
|
|
21
|
+
# actually-indented XHTML-shaped output via libxml's +FORMAT+
|
|
22
|
+
# save flag. Used by +DiffFormatter#prettyprint_for_display+
|
|
23
|
+
# (the +CANON_<FORMAT>_DIFF_SHOW_PRETTYPRINT_RECEIVED+ surface)
|
|
24
|
+
# so the user can read or paste the formatted output directly
|
|
25
|
+
# into a fixture heredoc. Output is XHTML-shaped (void
|
|
26
|
+
# elements self-closed, non-void paired) via the +AS_XHTML+
|
|
27
|
+
# save flag; the +NO_DECLARATION+ flag suppresses the
|
|
28
|
+
# +<?xml ...?>+ prefix.
|
|
29
|
+
#
|
|
30
|
+
# See lutaml/canon#133, lutaml/canon#135.
|
|
8
31
|
class Html
|
|
9
|
-
def initialize(indent: 2, indent_type: "space")
|
|
32
|
+
def initialize(indent: 2, indent_type: "space", fixture_ready: false)
|
|
10
33
|
@indent = indent.to_i
|
|
11
34
|
@indent_type = indent_type
|
|
35
|
+
@fixture_ready = fixture_ready
|
|
12
36
|
end
|
|
13
37
|
|
|
14
|
-
# Pretty print HTML with consistent indentation
|
|
15
38
|
def format(html_string)
|
|
16
|
-
|
|
39
|
+
return format_fixture_ready(html_string) if @fixture_ready
|
|
40
|
+
|
|
17
41
|
if xhtml?(html_string)
|
|
18
42
|
format_as_xhtml(html_string)
|
|
19
43
|
else
|
|
@@ -24,34 +48,72 @@ module Canon
|
|
|
24
48
|
private
|
|
25
49
|
|
|
26
50
|
def xhtml?(html_string)
|
|
27
|
-
# Check for XHTML DOCTYPE or xmlns attribute
|
|
28
51
|
html_string.include?("XHTML") ||
|
|
29
52
|
html_string.include?('xmlns="http://www.w3.org/1999/xhtml"')
|
|
30
53
|
end
|
|
31
54
|
|
|
32
55
|
def format_as_xhtml(html_string)
|
|
33
|
-
# Parse as XML for XHTML
|
|
34
56
|
doc = Nokogiri::XML(html_string, &:noblanks)
|
|
35
57
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
58
|
+
out = if @indent_type == "tab"
|
|
59
|
+
doc.to_xml(indent: 1, indent_text: "\t", encoding: "UTF-8")
|
|
60
|
+
else
|
|
61
|
+
doc.to_xml(indent: @indent, encoding: "UTF-8")
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
expand_non_void_self_closing(out)
|
|
42
65
|
end
|
|
43
66
|
|
|
44
67
|
def format_as_html(html_string)
|
|
45
|
-
# Parse as HTML5
|
|
46
68
|
doc = Nokogiri::HTML5(html_string)
|
|
47
69
|
|
|
48
|
-
# Use Nokogiri's built-in pretty printing
|
|
49
70
|
if @indent_type == "tab"
|
|
50
71
|
doc.to_html(indent: 1, indent_text: "\t", encoding: "UTF-8")
|
|
51
72
|
else
|
|
52
73
|
doc.to_html(indent: @indent, encoding: "UTF-8")
|
|
53
74
|
end
|
|
54
75
|
end
|
|
76
|
+
|
|
77
|
+
# Fixture-ready serialisation: parse with Nokogiri::HTML5 (so we
|
|
78
|
+
# get permissive recovery on real-world Word / XHTML5 / HTML5
|
|
79
|
+
# input shapes), then write through libxml's XML writer with
|
|
80
|
+
# +FORMAT+ + +AS_XHTML+ + +NO_DECLARATION+. +FORMAT+ inserts
|
|
81
|
+
# indentation; +AS_XHTML+ produces well-shaped output (void
|
|
82
|
+
# elements self-closed, non-void paired); +NO_DECLARATION+
|
|
83
|
+
# suppresses the +<?xml ...?>+ prefix.
|
|
84
|
+
def format_fixture_ready(html_string)
|
|
85
|
+
doc = Nokogiri::HTML5(html_string)
|
|
86
|
+
io = StringIO.new
|
|
87
|
+
if @indent_type == "tab"
|
|
88
|
+
doc.write_to(io, save_with: fixture_ready_save_options,
|
|
89
|
+
indent: 1, indent_text: "\t")
|
|
90
|
+
else
|
|
91
|
+
doc.write_to(io, save_with: fixture_ready_save_options,
|
|
92
|
+
indent: @indent)
|
|
93
|
+
end
|
|
94
|
+
io.string
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def fixture_ready_save_options
|
|
98
|
+
Nokogiri::XML::Node::SaveOptions::FORMAT |
|
|
99
|
+
Nokogiri::XML::Node::SaveOptions::AS_XHTML |
|
|
100
|
+
Nokogiri::XML::Node::SaveOptions::NO_DECLARATION
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
# Rewrite +<tag …/>+ into +<tag …></tag>+ for every element name
|
|
104
|
+
# that is not an HTML5 void element. +<a/>+ is illegal HTML;
|
|
105
|
+
# void tags like +<br/>+ and +<img …/>+ pass through unchanged.
|
|
106
|
+
def expand_non_void_self_closing(html)
|
|
107
|
+
html.gsub(%r{<([A-Za-z][A-Za-z0-9:_-]*)((?:\s+[^<>"]*(?:"[^"]*"[^<>"]*)*)?)/>}) do
|
|
108
|
+
name = ::Regexp.last_match(1)
|
|
109
|
+
attrs = ::Regexp.last_match(2)
|
|
110
|
+
if HtmlVoidElements.void?(name)
|
|
111
|
+
"<#{name}#{attrs}/>"
|
|
112
|
+
else
|
|
113
|
+
"<#{name}#{attrs}></#{name}>"
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
55
117
|
end
|
|
56
118
|
end
|
|
57
119
|
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "set"
|
|
4
|
+
|
|
5
|
+
module Canon
|
|
6
|
+
module PrettyPrinter
|
|
7
|
+
# The 14 HTML5 void elements — those whose start tag may stand alone
|
|
8
|
+
# (with no end tag) and which cannot have any content. Every other
|
|
9
|
+
# element with no children must be written as +<tag></tag>+ in HTML;
|
|
10
|
+
# writing +<a/>+ is illegal HTML and is parsed as +<a>+ (start tag only).
|
|
11
|
+
module HtmlVoidElements
|
|
12
|
+
VOID = Set.new(%w[area base br col embed hr img input link meta param
|
|
13
|
+
source track wbr]).freeze
|
|
14
|
+
|
|
15
|
+
def self.void?(name)
|
|
16
|
+
VOID.include?(name.to_s.downcase)
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "nokogiri"
|
|
4
|
+
require_relative "html_void_elements"
|
|
4
5
|
|
|
5
6
|
module Canon
|
|
6
7
|
module PrettyPrinter
|
|
@@ -133,12 +134,14 @@ module Canon
|
|
|
133
134
|
collapse_whitespace_elements: [],
|
|
134
135
|
strip_whitespace_elements: [],
|
|
135
136
|
pretty_printed: false,
|
|
136
|
-
sort_attributes: false
|
|
137
|
+
sort_attributes: false,
|
|
138
|
+
html_mode: false)
|
|
137
139
|
@indent = indent.to_i
|
|
138
140
|
@indent_char = indent_type == "tab" ? "\t" : " "
|
|
139
141
|
@vis_map = visualization_map || default_vis_map
|
|
140
142
|
@pretty_printed = pretty_printed
|
|
141
143
|
@sort_attributes = sort_attributes
|
|
144
|
+
@html_mode = html_mode
|
|
142
145
|
|
|
143
146
|
@strict_ws = Set.new((preserve_whitespace_elements || []).map(&:to_s))
|
|
144
147
|
@norm_ws = Set.new((collapse_whitespace_elements || []).map(&:to_s))
|
|
@@ -151,10 +154,10 @@ module Canon
|
|
|
151
154
|
# @return [String] Serialized XML, one node per line, with content
|
|
152
155
|
# whitespace visualized at line boundaries
|
|
153
156
|
def format(xml_string)
|
|
154
|
-
doc = Nokogiri::XML(xml_string)
|
|
157
|
+
doc = @html_mode ? Nokogiri::HTML5(xml_string) : Nokogiri::XML(xml_string)
|
|
155
158
|
lines = []
|
|
156
159
|
|
|
157
|
-
if doc.version
|
|
160
|
+
if !@html_mode && doc.version
|
|
158
161
|
enc = doc.encoding ? " encoding=\"#{doc.encoding}\"" : ""
|
|
159
162
|
lines << "<?xml version=\"#{doc.version}\"#{enc}?>"
|
|
160
163
|
end
|
|
@@ -198,6 +201,10 @@ module Canon
|
|
|
198
201
|
children = node.children.reject { |c| c.text? && c.content.empty? }
|
|
199
202
|
|
|
200
203
|
if children.empty?
|
|
204
|
+
if @html_mode && !HtmlVoidElements.void?(node.name)
|
|
205
|
+
return "#{ind(depth)}#{open_tag(node)}</#{node.name}>"
|
|
206
|
+
end
|
|
207
|
+
|
|
201
208
|
return "#{ind(depth)}#{open_tag(node,
|
|
202
209
|
self_close: true)}"
|
|
203
210
|
end
|