coradoc 1.1.2 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b6e77fc4eb6d79071cb1d309530adbd49a4364265b446d139c18eac2da3bce54
4
- data.tar.gz: 07a84837196bde12cc9d91ccd5084b8a173a4da5c237eeaea06a6d786c2f876c
3
+ metadata.gz: 4f515fbc05baa87f58f84a59737c9818603c6e9f0fc8835cdeb9bd6be9eb39e4
4
+ data.tar.gz: ce51ff395a3dfb4bf77c37f6b54fe745a14a7262911df50479088d43392d4927
5
5
  SHA512:
6
- metadata.gz: 69d2d12389a4e254b5ee8f4d67d3ddd406abeabebe1874b343acb4cd7c560c35284c8a58a4d2ee58cc788cfafd68989e88e783aa79fa3c32dc0f9dd389a276ac
7
- data.tar.gz: 7f034ef9a649446198793d8d8221a1bba5e22069f2feec85c5d30529170f8a87315540f481bc8b1219c03aaba78e42de67bbf94e25fd3e4ab93b6f221a356ca4
6
+ metadata.gz: 3269512745aea59b9780e5df2d872af7c3f19851666868335bc4876a534e50c64df410c5257a335324e9a9e0ad573dacf00058752ee00fd4b830f2f150cadeb2
7
+ data.tar.gz: 00e4122ec5e234e8e7e54d3e5769e03df15ad0f9851059576dadde7de58c0e2dd87f3c18ab14283574347bda6c95536cf31ee3ce6c202ea8b4c3f0da96f3f399
@@ -12,6 +12,16 @@ module Coradoc
12
12
  @rejected_named = []
13
13
  end
14
14
 
15
# Builds a one-line debugging summary of this attribute list.
#
# Lists the inspected positional values, then the named attributes as
# `key: value`, then the rejected positional and the rejected named
# attributes (each prefixed with "rejected: "). Empty groups are left out.
#
# @return [String] summary starting with "AttributeList: "
def inspect
  parts = [
    @positional.map(&:inspect).join(", "),
    @named.map { |k, v| "#{k}: #{v.inspect}" }.join(", "),
  ]
  unless @rejected_positional.empty?
    parts << "rejected: #{@rejected_positional.inspect}"
  end
  # Each rejected list is guarded by its own emptiness check, so rejected
  # named attributes are reported even when no positional ones were
  # rejected, and an empty list is never printed.
  unless @rejected_named.empty?
    parts << "rejected: #{@rejected_named.inspect}"
  end

  "AttributeList: " + parts.reject(&:empty?).join(", ")
end
24
+
15
25
  def add_positional(*attr)
16
26
  @positional += attr
17
27
  end
@@ -65,7 +75,9 @@ module Coradoc
65
75
 
66
76
  adoc = +""
67
77
  if !@positional.empty?
68
- adoc << @positional.map { |p| [nil, ""].include?(p) ? '""' : p }.join(",")
78
+ adoc << @positional.map do |p|
79
+ [nil, ""].include?(p) ? '""' : p
80
+ end.join(",")
69
81
  end
70
82
  adoc << "," if @positional.any? && @named.any?
71
83
  adoc << @named.map do |k, v|
@@ -14,6 +14,8 @@ module Coradoc
14
14
  when Coradoc::Element::Section
15
15
  return content unless i.safe_to_collapse?
16
16
 
17
+ collected_content << i.anchor if i.anchor
18
+
17
19
  simplified = simplify_block_content(i.contents)
18
20
 
19
21
  if simplified && !simplified.empty?
@@ -1,7 +1,7 @@
1
1
  module Coradoc
2
2
  module Element
3
3
  class Section < Base
4
- attr_accessor :id, :title, :attrs, :contents, :sections
4
+ attr_accessor :id, :title, :attrs, :contents, :sections, :anchor
5
5
 
6
6
  declare_children :id, :title, :contents, :sections
7
7
 
@@ -49,7 +49,7 @@ module Coradoc
49
49
  # HTML element and if it happens inside some other block element, can be
50
50
  # safely collapsed.
51
51
  def safe_to_collapse?
52
- @title.nil? && @id.nil? && @sections.empty?
52
+ @title.nil? && @sections.empty?
53
53
  end
54
54
 
55
55
  private
@@ -15,6 +15,15 @@ module Coradoc
15
15
  end
16
16
  end
17
17
 
18
# Returns a short debugging representation: the "TextElement" label, the
# id in parentheses when one is set, the inspected content, and the
# inspected line break when it is non-empty.
#
# @return [String] e.g. `TextElement(intro): "Hello" + "\n"`
def inspect
  label = @id ? "TextElement(#{@id})" : "TextElement"
  # Read @line_break directly (as #to_adoc does) for both the check and
  # the output; to_s keeps this debug helper from raising when it is nil.
  suffix = @line_break.to_s.empty? ? "" : " + #{@line_break.inspect}"
  "#{label}: #{@content.inspect}#{suffix}"
end
26
+
18
27
  def to_adoc
19
28
  Coradoc::Generator.gen_adoc(@content) + @line_break
20
29
  end
@@ -72,14 +72,14 @@ module Coradoc::Input::HTML
72
72
  leading_whitespace = $1
73
73
  if !leading_whitespace.nil?
74
74
  first_text = node.at_xpath("./text()[1]")
75
- first_text.replace(first_text.text.lstrip)
75
+ first_text.replace(first_text.text.lstrip) if first_text
76
76
  leading_whitespace = " "
77
77
  end
78
78
  node.text =~ /(\s+)$/
79
79
  trailing_whitespace = $1
80
80
  if !trailing_whitespace.nil?
81
81
  last_text = node.at_xpath("./text()[last()]")
82
- last_text.replace(last_text.text.rstrip)
82
+ last_text.replace(last_text.text.rstrip) if last_text
83
83
  trailing_whitespace = " "
84
84
  end
85
85
  [leading_whitespace, trailing_whitespace]
@@ -10,5 +10,6 @@ module Coradoc::Input::HTML
10
10
 
11
11
  register :div, Div.new
12
12
  register :article, Div.new
13
+ register :center, Div.new
13
14
  end
14
15
  end
@@ -4,6 +4,8 @@ module Coradoc::Input::HTML
4
4
  # is compatible with what we would get out of Coradoc, if
5
5
  # it parsed it directly.
6
6
  class Postprocessor
7
+ Element = Coradoc::Element
8
+
7
9
  def self.process(coradoc)
8
10
  new(coradoc).process
9
11
  end
@@ -12,17 +14,74 @@ module Coradoc::Input::HTML
12
14
  @tree = coradoc
13
15
  end
14
16
 
17
# Extracts titles from lists. This happens in HTML files
# generated from DOCX documents by LibreOffice.
#
# We are interested in a particular tree:
#   Element::List::Ordered items:
#     Element::List::Ordered items: (any depth)
#       Element::ListItem content:
#         Element::Title
#         (any number of other titles of the same scheme)
#
# This tree is flattened into:
#   Element::Title
#   Element::Title (any number of titles)
#
# Anchors set on the collapsed lists and on the list item are kept and
# appended after the extracted titles. Elements that do not match the
# pattern are returned unchanged. The result replaces @tree.
def extract_titles_from_lists
  @tree = Element::Base.visit(@tree) do |elem, dir|
    next elem unless dir == :pre
    next elem unless elem.is_a?(Element::List::Ordered)

    # Walk down through ordered lists that hold exactly one item. The walk
    # starts at `elem` itself, so every list on the path (including the
    # outermost one) contributes its anchor exactly once.
    anchors = []
    processed = elem
    while processed.is_a?(Element::List::Ordered) &&
        processed.items.length == 1
      anchors << processed.anchor if processed.anchor
      processed = processed.items.first
    end

    # Stopping on a list means it had zero or several items; stopping on
    # anything other than a ListItem means the shape does not match.
    next elem if processed.is_a?(Element::List::Ordered)
    next elem unless processed.is_a?(Element::ListItem)

    anchors << processed.anchor if processed.anchor

    # Now we must have a title (or titles).
    titles = processed.content.flatten

    # Don't bother if there's no title in there.
    next elem unless titles.any? { |i| i.is_a?(Element::Title) }

    # A nested Ordered list is allowed: it is left for another iteration
    # of this cleanup.
    only_titles_or_lists = titles.all? do |i|
      i.is_a?(Element::Title) || i.is_a?(Element::List::Ordered)
    end
    next elem unless only_titles_or_lists

    # We are done now.
    titles + anchors
  end
end
71
+
15
72
  # Collapse DIVs that only have a title, or nest another DIV.
16
73
  def collapse_meaningless_sections
17
- @tree = Coradoc::Element::Base.visit(@tree) do |elem, _dir|
18
- if elem.is_a?(Coradoc::Element::Section) && elem.safe_to_collapse?
74
+ @tree = Element::Base.visit(@tree) do |elem, _dir|
75
+ if elem.is_a?(Element::Section) && elem.safe_to_collapse?
19
76
  children_classes = Array(elem.contents).map(&:class)
20
77
  count = children_classes.length
21
- safe_classes = [Coradoc::Element::Section, Coradoc::Element::Title]
78
+ safe_classes = [Element::Section, Element::Title]
22
79
 
23
80
  # Count > 0 because some documents use <div> as a <br>.
24
81
  if count > 0 && children_classes.all? { |i| safe_classes.include?(i) }
25
- next elem.contents
82
+ contents = elem.contents.dup
83
+ contents.prepend(elem.anchor) if elem.anchor
84
+ next contents
26
85
  end
27
86
  end
28
87
  elem
@@ -32,12 +91,14 @@ module Coradoc::Input::HTML
32
91
  # tree should now be more cleaned up, so we can progress with
33
92
  # creating meaningful sections
34
93
  def generate_meaningful_sections
35
- @tree = Coradoc::Element::Base.visit(@tree) do |elem, dir|
94
+ @tree = Element::Base.visit(@tree) do |elem, dir|
36
95
  # We are searching for an array, that has a title. This
37
96
  # will be a candidate for our section array.
38
97
  if dir == :post &&
39
98
  elem.is_a?(Array) &&
40
- !elem.grep(Coradoc::Element::Title).empty?
99
+ !elem.flatten.grep(Element::Title).empty?
100
+
101
+ elem = elem.flatten
41
102
 
42
103
  new_array = []
43
104
  content_array = new_array
@@ -47,12 +108,12 @@ module Coradoc::Input::HTML
47
108
  # all descendant sections into those sections. Otherwise, we push
48
109
  # an element as content of current section.
49
110
  elem.each do |e|
50
- if e.is_a? Coradoc::Element::Title
111
+ if e.is_a? Element::Title
51
112
  title = e
52
113
  content_array = []
53
114
  section_array = []
54
115
  level = title.level_int
55
- section = Coradoc::Element::Section.new(
116
+ section = Element::Section.new(
56
117
  title, contents: content_array, sections: section_array
57
118
  )
58
119
  # Some documents may not be consistent and eg. follow H4 after
@@ -82,11 +143,11 @@ module Coradoc::Input::HTML
82
143
  previous_sections = {}
83
144
 
84
145
  determine_section_id = ->(elem) do
85
- if elem.title.style == "appendix"
86
- level = "A"
87
- else
88
- level = 1
89
- end
146
+ level = if elem.title.style == "appendix"
147
+ "A"
148
+ else
149
+ 1
150
+ end
90
151
 
91
152
  section = previous_sections[elem]
92
153
  while section
@@ -102,8 +163,8 @@ module Coradoc::Input::HTML
102
163
  style
103
164
  end
104
165
 
105
- @tree = Coradoc::Element::Base.visit(@tree) do |elem, dir|
106
- title = elem.title if elem.is_a?(Coradoc::Element::Section)
166
+ @tree = Element::Base.visit(@tree) do |elem, dir|
167
+ title = elem.title if elem.is_a?(Element::Section)
107
168
 
108
169
  if title && title.level_int <= max_level
109
170
  if dir == :pre
@@ -137,6 +198,7 @@ module Coradoc::Input::HTML
137
198
  end
138
199
 
139
200
  def process
201
+ extract_titles_from_lists
140
202
  collapse_meaningless_sections
141
203
  generate_meaningful_sections
142
204
  # Do it again to simplify the document further.
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Coradoc
4
- VERSION = "1.1.2"
4
+ VERSION = "1.1.3"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coradoc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2
4
+ version: 1.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2024-09-18 00:00:00.000000000 Z
12
+ date: 2024-11-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: marcel