coradoc 1.1.2 → 1.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b6e77fc4eb6d79071cb1d309530adbd49a4364265b446d139c18eac2da3bce54
4
- data.tar.gz: 07a84837196bde12cc9d91ccd5084b8a173a4da5c237eeaea06a6d786c2f876c
3
+ metadata.gz: 4f515fbc05baa87f58f84a59737c9818603c6e9f0fc8835cdeb9bd6be9eb39e4
4
+ data.tar.gz: ce51ff395a3dfb4bf77c37f6b54fe745a14a7262911df50479088d43392d4927
5
5
  SHA512:
6
- metadata.gz: 69d2d12389a4e254b5ee8f4d67d3ddd406abeabebe1874b343acb4cd7c560c35284c8a58a4d2ee58cc788cfafd68989e88e783aa79fa3c32dc0f9dd389a276ac
7
- data.tar.gz: 7f034ef9a649446198793d8d8221a1bba5e22069f2feec85c5d30529170f8a87315540f481bc8b1219c03aaba78e42de67bbf94e25fd3e4ab93b6f221a356ca4
6
+ metadata.gz: 3269512745aea59b9780e5df2d872af7c3f19851666868335bc4876a534e50c64df410c5257a335324e9a9e0ad573dacf00058752ee00fd4b830f2f150cadeb2
7
+ data.tar.gz: 00e4122ec5e234e8e7e54d3e5769e03df15ad0f9851059576dadde7de58c0e2dd87f3c18ab14283574347bda6c95536cf31ee3ce6c202ea8b4c3f0da96f3f399
@@ -12,6 +12,16 @@ module Coradoc
12
12
  @rejected_named = []
13
13
  end
14
14
 
15
+ def inspect
16
+ "AttributeList: " +
17
+ [
18
+ @positional.map(&:inspect).join(", "),
19
+ @named.map { |k, v| "#{k}: #{v.inspect}" }.join(", "),
20
+ (@rejected_positional.empty? or "rejected: #{@rejected_positional.inspect}"),
21
+ (@rejected_positional.empty? or "rejected: #{@rejected_named.inspect}"),
22
+ ].reject { |i| i == true || i.empty? }.join(", ")
23
+ end
24
+
15
25
  def add_positional(*attr)
16
26
  @positional += attr
17
27
  end
@@ -65,7 +75,9 @@ module Coradoc
65
75
 
66
76
  adoc = +""
67
77
  if !@positional.empty?
68
- adoc << @positional.map { |p| [nil, ""].include?(p) ? '""' : p }.join(",")
78
+ adoc << @positional.map do |p|
79
+ [nil, ""].include?(p) ? '""' : p
80
+ end.join(",")
69
81
  end
70
82
  adoc << "," if @positional.any? && @named.any?
71
83
  adoc << @named.map do |k, v|
@@ -14,6 +14,8 @@ module Coradoc
14
14
  when Coradoc::Element::Section
15
15
  return content unless i.safe_to_collapse?
16
16
 
17
+ collected_content << i.anchor if i.anchor
18
+
17
19
  simplified = simplify_block_content(i.contents)
18
20
 
19
21
  if simplified && !simplified.empty?
@@ -1,7 +1,7 @@
1
1
  module Coradoc
2
2
  module Element
3
3
  class Section < Base
4
- attr_accessor :id, :title, :attrs, :contents, :sections
4
+ attr_accessor :id, :title, :attrs, :contents, :sections, :anchor
5
5
 
6
6
  declare_children :id, :title, :contents, :sections
7
7
 
@@ -49,7 +49,7 @@ module Coradoc
49
49
  # HTML element and if it happens inside some other block element, can be
50
50
  # safely collapsed.
51
51
  def safe_to_collapse?
52
- @title.nil? && @id.nil? && @sections.empty?
52
+ @title.nil? && @sections.empty?
53
53
  end
54
54
 
55
55
  private
@@ -15,6 +15,15 @@ module Coradoc
15
15
  end
16
16
  end
17
17
 
18
+ def inspect
19
+ str = "TextElement"
20
+ str += "(#{@id})" if @id
21
+ str += ": "
22
+ str += @content.inspect
23
+ str += " + #{@line_break.inspect}" unless line_break.empty?
24
+ str
25
+ end
26
+
18
27
  def to_adoc
19
28
  Coradoc::Generator.gen_adoc(@content) + @line_break
20
29
  end
@@ -72,14 +72,14 @@ module Coradoc::Input::HTML
72
72
  leading_whitespace = $1
73
73
  if !leading_whitespace.nil?
74
74
  first_text = node.at_xpath("./text()[1]")
75
- first_text.replace(first_text.text.lstrip)
75
+ first_text.replace(first_text.text.lstrip) if first_text
76
76
  leading_whitespace = " "
77
77
  end
78
78
  node.text =~ /(\s+)$/
79
79
  trailing_whitespace = $1
80
80
  if !trailing_whitespace.nil?
81
81
  last_text = node.at_xpath("./text()[last()]")
82
- last_text.replace(last_text.text.rstrip)
82
+ last_text.replace(last_text.text.rstrip) if last_text
83
83
  trailing_whitespace = " "
84
84
  end
85
85
  [leading_whitespace, trailing_whitespace]
@@ -10,5 +10,6 @@ module Coradoc::Input::HTML
10
10
 
11
11
  register :div, Div.new
12
12
  register :article, Div.new
13
+ register :center, Div.new
13
14
  end
14
15
  end
@@ -4,6 +4,8 @@ module Coradoc::Input::HTML
4
4
  # is compatible with what we would get out of Coradoc, if
5
5
  # it parsed it directly.
6
6
  class Postprocessor
7
+ Element = Coradoc::Element
8
+
7
9
  def self.process(coradoc)
8
10
  new(coradoc).process
9
11
  end
@@ -12,17 +14,74 @@ module Coradoc::Input::HTML
12
14
  @tree = coradoc
13
15
  end
14
16
 
17
+ # Extracts titles from lists. This happens in HTML files
18
+ # generated from DOCX documents by LibreOffice.
19
+ #
20
+ # We are interested in a particular tree:
21
+ # Element::List::Ordered items:
22
+ # Element::List::Ordered items: (any depth)
23
+ # Element::ListItem content:
24
+ # Element::Title
25
+ # (any number of other titles of the same scheme)
26
+ #
27
+ # This tree is flattened into:
28
+ # Element::Title
29
+ # Element::Title (any number of titles)
30
+ def extract_titles_from_lists
31
+ @tree = Element::Base.visit(@tree) do |elem, dir|
32
+ next elem unless dir == :pre
33
+ next elem unless elem.is_a?(Element::List::Ordered)
34
+ next elem if elem.items.length != 1
35
+
36
+ anchors = []
37
+ anchors << elem.anchor if elem.anchor
38
+
39
+ # Extract ListItem from any depth of List::Ordered
40
+ processed = elem
41
+ while processed.is_a?(Element::List::Ordered)
42
+ if processed.items.length != 1
43
+ backtrack = true
44
+ break
45
+ end
46
+ anchors << processed.anchor if processed.anchor
47
+ processed = processed.items.first
48
+ end
49
+
50
+ # Something went wrong? Anything not matching on the way?
51
+ next elem if backtrack
52
+ next elem unless processed.is_a?(Element::ListItem)
53
+
54
+ anchors << processed.anchor if processed.anchor
55
+
56
+ # Now we must have a title (or titles).
57
+ titles = processed.content.flatten
58
+
59
+ # Don't bother if there's no title in there.
60
+ next elem unless titles.any? { |i| i.is_a? Element::Title }
61
+
62
+ # Ordered is another iteration for our cleanup.
63
+ next elem unless titles.all? do |i|
64
+ i.is_a?(Element::Title) || i.is_a?(Element::List::Ordered)
65
+ end
66
+
67
+ # We are done now.
68
+ titles + anchors
69
+ end
70
+ end
71
+
15
72
  # Collapse DIVs that only have a title, or nest another DIV.
16
73
  def collapse_meaningless_sections
17
- @tree = Coradoc::Element::Base.visit(@tree) do |elem, _dir|
18
- if elem.is_a?(Coradoc::Element::Section) && elem.safe_to_collapse?
74
+ @tree = Element::Base.visit(@tree) do |elem, _dir|
75
+ if elem.is_a?(Element::Section) && elem.safe_to_collapse?
19
76
  children_classes = Array(elem.contents).map(&:class)
20
77
  count = children_classes.length
21
- safe_classes = [Coradoc::Element::Section, Coradoc::Element::Title]
78
+ safe_classes = [Element::Section, Element::Title]
22
79
 
23
80
  # Count > 0 because some documents use <div> as a <br>.
24
81
  if count > 0 && children_classes.all? { |i| safe_classes.include?(i) }
25
- next elem.contents
82
+ contents = elem.contents.dup
83
+ contents.prepend(elem.anchor) if elem.anchor
84
+ next contents
26
85
  end
27
86
  end
28
87
  elem
@@ -32,12 +91,14 @@ module Coradoc::Input::HTML
32
91
  # tree should now be more cleaned up, so we can progress with
33
92
  # creating meaningful sections
34
93
  def generate_meaningful_sections
35
- @tree = Coradoc::Element::Base.visit(@tree) do |elem, dir|
94
+ @tree = Element::Base.visit(@tree) do |elem, dir|
36
95
  # We are searching for an array, that has a title. This
37
96
  # will be a candidate for our section array.
38
97
  if dir == :post &&
39
98
  elem.is_a?(Array) &&
40
- !elem.grep(Coradoc::Element::Title).empty?
99
+ !elem.flatten.grep(Element::Title).empty?
100
+
101
+ elem = elem.flatten
41
102
 
42
103
  new_array = []
43
104
  content_array = new_array
@@ -47,12 +108,12 @@ module Coradoc::Input::HTML
47
108
  # all descendant sections into those sections. Otherwise, we push
48
109
  # an element as content of current section.
49
110
  elem.each do |e|
50
- if e.is_a? Coradoc::Element::Title
111
+ if e.is_a? Element::Title
51
112
  title = e
52
113
  content_array = []
53
114
  section_array = []
54
115
  level = title.level_int
55
- section = Coradoc::Element::Section.new(
116
+ section = Element::Section.new(
56
117
  title, contents: content_array, sections: section_array
57
118
  )
58
119
  # Some documents may not be consistent and eg. follow H4 after
@@ -82,11 +143,11 @@ module Coradoc::Input::HTML
82
143
  previous_sections = {}
83
144
 
84
145
  determine_section_id = ->(elem) do
85
- if elem.title.style == "appendix"
86
- level = "A"
87
- else
88
- level = 1
89
- end
146
+ level = if elem.title.style == "appendix"
147
+ "A"
148
+ else
149
+ 1
150
+ end
90
151
 
91
152
  section = previous_sections[elem]
92
153
  while section
@@ -102,8 +163,8 @@ module Coradoc::Input::HTML
102
163
  style
103
164
  end
104
165
 
105
- @tree = Coradoc::Element::Base.visit(@tree) do |elem, dir|
106
- title = elem.title if elem.is_a?(Coradoc::Element::Section)
166
+ @tree = Element::Base.visit(@tree) do |elem, dir|
167
+ title = elem.title if elem.is_a?(Element::Section)
107
168
 
108
169
  if title && title.level_int <= max_level
109
170
  if dir == :pre
@@ -137,6 +198,7 @@ module Coradoc::Input::HTML
137
198
  end
138
199
 
139
200
  def process
201
+ extract_titles_from_lists
140
202
  collapse_meaningless_sections
141
203
  generate_meaningful_sections
142
204
  # Do it again to simplify the document further.
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Coradoc
4
- VERSION = "1.1.2"
4
+ VERSION = "1.1.3"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coradoc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2
4
+ version: 1.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2024-09-18 00:00:00.000000000 Z
12
+ date: 2024-11-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: marcel