coradoc 1.1.2 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/coradoc/element/attribute_list.rb +13 -1
- data/lib/coradoc/element/base.rb +2 -0
- data/lib/coradoc/element/section.rb +2 -2
- data/lib/coradoc/element/text_element.rb +9 -0
- data/lib/coradoc/input/html/converters/base.rb +2 -2
- data/lib/coradoc/input/html/converters/div.rb +1 -0
- data/lib/coradoc/input/html/postprocessor.rb +77 -15
- data/lib/coradoc/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4f515fbc05baa87f58f84a59737c9818603c6e9f0fc8835cdeb9bd6be9eb39e4
|
4
|
+
data.tar.gz: ce51ff395a3dfb4bf77c37f6b54fe745a14a7262911df50479088d43392d4927
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3269512745aea59b9780e5df2d872af7c3f19851666868335bc4876a534e50c64df410c5257a335324e9a9e0ad573dacf00058752ee00fd4b830f2f150cadeb2
|
7
|
+
data.tar.gz: 00e4122ec5e234e8e7e54d3e5769e03df15ad0f9851059576dadde7de58c0e2dd87f3c18ab14283574347bda6c95536cf31ee3ce6c202ea8b4c3f0da96f3f399
|
@@ -12,6 +12,16 @@ module Coradoc
|
|
12
12
|
@rejected_named = []
|
13
13
|
end
|
14
14
|
|
15
|
+
def inspect
|
16
|
+
"AttributeList: " +
|
17
|
+
[
|
18
|
+
@positional.map(&:inspect).join(", "),
|
19
|
+
@named.map { |k, v| "#{k}: #{v.inspect}" }.join(", "),
|
20
|
+
(@rejected_positional.empty? or "rejected: #{@rejected_positional.inspect}"),
|
21
|
+
(@rejected_positional.empty? or "rejected: #{@rejected_named.inspect}"),
|
22
|
+
].reject { |i| i == true || i.empty? }.join(", ")
|
23
|
+
end
|
24
|
+
|
15
25
|
def add_positional(*attr)
|
16
26
|
@positional += attr
|
17
27
|
end
|
@@ -65,7 +75,9 @@ module Coradoc
|
|
65
75
|
|
66
76
|
adoc = +""
|
67
77
|
if !@positional.empty?
|
68
|
-
adoc << @positional.map
|
78
|
+
adoc << @positional.map do |p|
|
79
|
+
[nil, ""].include?(p) ? '""' : p
|
80
|
+
end.join(",")
|
69
81
|
end
|
70
82
|
adoc << "," if @positional.any? && @named.any?
|
71
83
|
adoc << @named.map do |k, v|
|
data/lib/coradoc/element/base.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module Coradoc
|
2
2
|
module Element
|
3
3
|
class Section < Base
|
4
|
-
attr_accessor :id, :title, :attrs, :contents, :sections
|
4
|
+
attr_accessor :id, :title, :attrs, :contents, :sections, :anchor
|
5
5
|
|
6
6
|
declare_children :id, :title, :contents, :sections
|
7
7
|
|
@@ -49,7 +49,7 @@ module Coradoc
|
|
49
49
|
# HTML element and if it happens inside some other block element, can be
|
50
50
|
# safely collapsed.
|
51
51
|
def safe_to_collapse?
|
52
|
-
@title.nil? && @
|
52
|
+
@title.nil? && @sections.empty?
|
53
53
|
end
|
54
54
|
|
55
55
|
private
|
@@ -15,6 +15,15 @@ module Coradoc
|
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
18
|
+
def inspect
|
19
|
+
str = "TextElement"
|
20
|
+
str += "(#{@id})" if @id
|
21
|
+
str += ": "
|
22
|
+
str += @content.inspect
|
23
|
+
str += " + #{@line_break.inspect}" unless line_break.empty?
|
24
|
+
str
|
25
|
+
end
|
26
|
+
|
18
27
|
def to_adoc
|
19
28
|
Coradoc::Generator.gen_adoc(@content) + @line_break
|
20
29
|
end
|
@@ -72,14 +72,14 @@ module Coradoc::Input::HTML
|
|
72
72
|
leading_whitespace = $1
|
73
73
|
if !leading_whitespace.nil?
|
74
74
|
first_text = node.at_xpath("./text()[1]")
|
75
|
-
first_text.replace(first_text.text.lstrip)
|
75
|
+
first_text.replace(first_text.text.lstrip) if first_text
|
76
76
|
leading_whitespace = " "
|
77
77
|
end
|
78
78
|
node.text =~ /(\s+)$/
|
79
79
|
trailing_whitespace = $1
|
80
80
|
if !trailing_whitespace.nil?
|
81
81
|
last_text = node.at_xpath("./text()[last()]")
|
82
|
-
last_text.replace(last_text.text.rstrip)
|
82
|
+
last_text.replace(last_text.text.rstrip) if last_text
|
83
83
|
trailing_whitespace = " "
|
84
84
|
end
|
85
85
|
[leading_whitespace, trailing_whitespace]
|
@@ -4,6 +4,8 @@ module Coradoc::Input::HTML
|
|
4
4
|
# is compatible with what we would get out of Coradoc, if
|
5
5
|
# it parsed it directly.
|
6
6
|
class Postprocessor
|
7
|
+
Element = Coradoc::Element
|
8
|
+
|
7
9
|
def self.process(coradoc)
|
8
10
|
new(coradoc).process
|
9
11
|
end
|
@@ -12,17 +14,74 @@ module Coradoc::Input::HTML
|
|
12
14
|
@tree = coradoc
|
13
15
|
end
|
14
16
|
|
17
|
+
# Extracts titles from lists. This happens in HTML files
|
18
|
+
# generated from DOCX documents by LibreOffice.
|
19
|
+
#
|
20
|
+
# We are interested in a particular tree:
|
21
|
+
# Element::List::Ordered items:
|
22
|
+
# Element::List::Ordered items: (any depth)
|
23
|
+
# Element::ListItem content:
|
24
|
+
# Element::Title
|
25
|
+
# (any number of other titles of the same scheme)
|
26
|
+
#
|
27
|
+
# This tree is flattened into:
|
28
|
+
# Element::Title
|
29
|
+
# Element::Title (any number of titles)
|
30
|
+
def extract_titles_from_lists
|
31
|
+
@tree = Element::Base.visit(@tree) do |elem, dir|
|
32
|
+
next elem unless dir == :pre
|
33
|
+
next elem unless elem.is_a?(Element::List::Ordered)
|
34
|
+
next elem if elem.items.length != 1
|
35
|
+
|
36
|
+
anchors = []
|
37
|
+
anchors << elem.anchor if elem.anchor
|
38
|
+
|
39
|
+
# Extract ListItem from any depth of List::Ordered
|
40
|
+
processed = elem
|
41
|
+
while processed.is_a?(Element::List::Ordered)
|
42
|
+
if processed.items.length != 1
|
43
|
+
backtrack = true
|
44
|
+
break
|
45
|
+
end
|
46
|
+
anchors << processed.anchor if processed.anchor
|
47
|
+
processed = processed.items.first
|
48
|
+
end
|
49
|
+
|
50
|
+
# Something went wrong? Anything not matching on the way?
|
51
|
+
next elem if backtrack
|
52
|
+
next elem unless processed.is_a?(Element::ListItem)
|
53
|
+
|
54
|
+
anchors << processed.anchor if processed.anchor
|
55
|
+
|
56
|
+
# Now we must have a title (or titles).
|
57
|
+
titles = processed.content.flatten
|
58
|
+
|
59
|
+
# Don't bother if there's no title in there.
|
60
|
+
next elem unless titles.any? { |i| i.is_a? Element::Title }
|
61
|
+
|
62
|
+
# Ordered is another iteration for our cleanup.
|
63
|
+
next elem unless titles.all? do |i|
|
64
|
+
i.is_a?(Element::Title) || i.is_a?(Element::List::Ordered)
|
65
|
+
end
|
66
|
+
|
67
|
+
# We are done now.
|
68
|
+
titles + anchors
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
15
72
|
# Collapse DIVs that only have a title, or nest another DIV.
|
16
73
|
def collapse_meaningless_sections
|
17
|
-
@tree =
|
18
|
-
if elem.is_a?(
|
74
|
+
@tree = Element::Base.visit(@tree) do |elem, _dir|
|
75
|
+
if elem.is_a?(Element::Section) && elem.safe_to_collapse?
|
19
76
|
children_classes = Array(elem.contents).map(&:class)
|
20
77
|
count = children_classes.length
|
21
|
-
safe_classes = [
|
78
|
+
safe_classes = [Element::Section, Element::Title]
|
22
79
|
|
23
80
|
# Count > 0 because some documents use <div> as a <br>.
|
24
81
|
if count > 0 && children_classes.all? { |i| safe_classes.include?(i) }
|
25
|
-
|
82
|
+
contents = elem.contents.dup
|
83
|
+
contents.prepend(elem.anchor) if elem.anchor
|
84
|
+
next contents
|
26
85
|
end
|
27
86
|
end
|
28
87
|
elem
|
@@ -32,12 +91,14 @@ module Coradoc::Input::HTML
|
|
32
91
|
# tree should now be more cleaned up, so we can progress with
|
33
92
|
# creating meaningful sections
|
34
93
|
def generate_meaningful_sections
|
35
|
-
@tree =
|
94
|
+
@tree = Element::Base.visit(@tree) do |elem, dir|
|
36
95
|
# We are searching for an array, that has a title. This
|
37
96
|
# will be a candidate for our section array.
|
38
97
|
if dir == :post &&
|
39
98
|
elem.is_a?(Array) &&
|
40
|
-
!elem.grep(
|
99
|
+
!elem.flatten.grep(Element::Title).empty?
|
100
|
+
|
101
|
+
elem = elem.flatten
|
41
102
|
|
42
103
|
new_array = []
|
43
104
|
content_array = new_array
|
@@ -47,12 +108,12 @@ module Coradoc::Input::HTML
|
|
47
108
|
# all descendant sections into those sections. Otherwise, we push
|
48
109
|
# an element as content of current section.
|
49
110
|
elem.each do |e|
|
50
|
-
if e.is_a?
|
111
|
+
if e.is_a? Element::Title
|
51
112
|
title = e
|
52
113
|
content_array = []
|
53
114
|
section_array = []
|
54
115
|
level = title.level_int
|
55
|
-
section =
|
116
|
+
section = Element::Section.new(
|
56
117
|
title, contents: content_array, sections: section_array
|
57
118
|
)
|
58
119
|
# Some documents may not be consistent and eg. follow H4 after
|
@@ -82,11 +143,11 @@ module Coradoc::Input::HTML
|
|
82
143
|
previous_sections = {}
|
83
144
|
|
84
145
|
determine_section_id = ->(elem) do
|
85
|
-
if elem.title.style == "appendix"
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
146
|
+
level = if elem.title.style == "appendix"
|
147
|
+
"A"
|
148
|
+
else
|
149
|
+
1
|
150
|
+
end
|
90
151
|
|
91
152
|
section = previous_sections[elem]
|
92
153
|
while section
|
@@ -102,8 +163,8 @@ module Coradoc::Input::HTML
|
|
102
163
|
style
|
103
164
|
end
|
104
165
|
|
105
|
-
@tree =
|
106
|
-
title = elem.title if elem.is_a?(
|
166
|
+
@tree = Element::Base.visit(@tree) do |elem, dir|
|
167
|
+
title = elem.title if elem.is_a?(Element::Section)
|
107
168
|
|
108
169
|
if title && title.level_int <= max_level
|
109
170
|
if dir == :pre
|
@@ -137,6 +198,7 @@ module Coradoc::Input::HTML
|
|
137
198
|
end
|
138
199
|
|
139
200
|
def process
|
201
|
+
extract_titles_from_lists
|
140
202
|
collapse_meaningless_sections
|
141
203
|
generate_meaningful_sections
|
142
204
|
# Do it again to simplify the document further.
|
data/lib/coradoc/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: coradoc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2024-
|
12
|
+
date: 2024-11-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: marcel
|