coradoc 1.1.2 → 1.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/coradoc/element/attribute_list.rb +13 -1
- data/lib/coradoc/element/base.rb +2 -0
- data/lib/coradoc/element/section.rb +2 -2
- data/lib/coradoc/element/text_element.rb +9 -0
- data/lib/coradoc/input/html/converters/base.rb +2 -2
- data/lib/coradoc/input/html/converters/div.rb +1 -0
- data/lib/coradoc/input/html/postprocessor.rb +77 -15
- data/lib/coradoc/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4f515fbc05baa87f58f84a59737c9818603c6e9f0fc8835cdeb9bd6be9eb39e4
|
4
|
+
data.tar.gz: ce51ff395a3dfb4bf77c37f6b54fe745a14a7262911df50479088d43392d4927
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3269512745aea59b9780e5df2d872af7c3f19851666868335bc4876a534e50c64df410c5257a335324e9a9e0ad573dacf00058752ee00fd4b830f2f150cadeb2
|
7
|
+
data.tar.gz: 00e4122ec5e234e8e7e54d3e5769e03df15ad0f9851059576dadde7de58c0e2dd87f3c18ab14283574347bda6c95536cf31ee3ce6c202ea8b4c3f0da96f3f399
|
data/lib/coradoc/element/attribute_list.rb
CHANGED
@@ -12,6 +12,16 @@ module Coradoc
|
|
12
12
|
@rejected_named = []
|
13
13
|
end
|
14
14
|
|
15
|
+
def inspect
|
16
|
+
"AttributeList: " +
|
17
|
+
[
|
18
|
+
@positional.map(&:inspect).join(", "),
|
19
|
+
@named.map { |k, v| "#{k}: #{v.inspect}" }.join(", "),
|
20
|
+
(@rejected_positional.empty? or "rejected: #{@rejected_positional.inspect}"),
|
21
|
+
(@rejected_positional.empty? or "rejected: #{@rejected_named.inspect}"),
|
22
|
+
].reject { |i| i == true || i.empty? }.join(", ")
|
23
|
+
end
|
24
|
+
|
15
25
|
def add_positional(*attr)
|
16
26
|
@positional += attr
|
17
27
|
end
|
@@ -65,7 +75,9 @@ module Coradoc
|
|
65
75
|
|
66
76
|
adoc = +""
|
67
77
|
if !@positional.empty?
|
68
|
-
adoc << @positional.map
|
78
|
+
adoc << @positional.map do |p|
|
79
|
+
[nil, ""].include?(p) ? '""' : p
|
80
|
+
end.join(",")
|
69
81
|
end
|
70
82
|
adoc << "," if @positional.any? && @named.any?
|
71
83
|
adoc << @named.map do |k, v|
|
data/lib/coradoc/element/base.rb
CHANGED
data/lib/coradoc/element/section.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module Coradoc
|
2
2
|
module Element
|
3
3
|
class Section < Base
|
4
|
-
attr_accessor :id, :title, :attrs, :contents, :sections
|
4
|
+
attr_accessor :id, :title, :attrs, :contents, :sections, :anchor
|
5
5
|
|
6
6
|
declare_children :id, :title, :contents, :sections
|
7
7
|
|
@@ -49,7 +49,7 @@ module Coradoc
|
|
49
49
|
# HTML element and if it happens inside some other block element, can be
|
50
50
|
# safely collapsed.
|
51
51
|
def safe_to_collapse?
|
52
|
-
@title.nil? && @
|
52
|
+
@title.nil? && @sections.empty?
|
53
53
|
end
|
54
54
|
|
55
55
|
private
|
data/lib/coradoc/element/text_element.rb
CHANGED
@@ -15,6 +15,15 @@ module Coradoc
|
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
18
|
+
def inspect
|
19
|
+
str = "TextElement"
|
20
|
+
str += "(#{@id})" if @id
|
21
|
+
str += ": "
|
22
|
+
str += @content.inspect
|
23
|
+
str += " + #{@line_break.inspect}" unless line_break.empty?
|
24
|
+
str
|
25
|
+
end
|
26
|
+
|
18
27
|
def to_adoc
|
19
28
|
Coradoc::Generator.gen_adoc(@content) + @line_break
|
20
29
|
end
|
data/lib/coradoc/input/html/converters/base.rb
CHANGED
@@ -72,14 +72,14 @@ module Coradoc::Input::HTML
|
|
72
72
|
leading_whitespace = $1
|
73
73
|
if !leading_whitespace.nil?
|
74
74
|
first_text = node.at_xpath("./text()[1]")
|
75
|
-
first_text.replace(first_text.text.lstrip)
|
75
|
+
first_text.replace(first_text.text.lstrip) if first_text
|
76
76
|
leading_whitespace = " "
|
77
77
|
end
|
78
78
|
node.text =~ /(\s+)$/
|
79
79
|
trailing_whitespace = $1
|
80
80
|
if !trailing_whitespace.nil?
|
81
81
|
last_text = node.at_xpath("./text()[last()]")
|
82
|
-
last_text.replace(last_text.text.rstrip)
|
82
|
+
last_text.replace(last_text.text.rstrip) if last_text
|
83
83
|
trailing_whitespace = " "
|
84
84
|
end
|
85
85
|
[leading_whitespace, trailing_whitespace]
|
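data/lib/coradoc/input/html/converters/div.rb
CHANGED
data/lib/coradoc/input/html/postprocessor.rb
CHANGED
@@ -4,6 +4,8 @@ module Coradoc::Input::HTML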
|
|
4
4
|
# is compatible with what we would get out of Coradoc, if
|
5
5
|
# it parsed it directly.
|
6
6
|
class Postprocessor
|
7
|
+
Element = Coradoc::Element
|
8
|
+
|
7
9
|
def self.process(coradoc)
|
8
10
|
new(coradoc).process
|
9
11
|
end
|
@@ -12,17 +14,74 @@ module Coradoc::Input::HTML
|
|
12
14
|
@tree = coradoc
|
13
15
|
end
|
14
16
|
|
17
|
+
# Extracts titles from lists. This happens in HTML files
|
18
|
+
# generated from DOCX documents by LibreOffice.
|
19
|
+
#
|
20
|
+
# We are interested in a particular tree:
|
21
|
+
# Element::List::Ordered items:
|
22
|
+
# Element::List::Ordered items: (any depth)
|
23
|
+
# Element::ListItem content:
|
24
|
+
# Element::Title
|
25
|
+
# (any number of other titles of the same scheme)
|
26
|
+
#
|
27
|
+
# This tree is flattened into:
|
28
|
+
# Element::Title
|
29
|
+
# Element::Title (any number of titles)
|
30
|
+
def extract_titles_from_lists
|
31
|
+
@tree = Element::Base.visit(@tree) do |elem, dir|
|
32
|
+
next elem unless dir == :pre
|
33
|
+
next elem unless elem.is_a?(Element::List::Ordered)
|
34
|
+
next elem if elem.items.length != 1
|
35
|
+
|
36
|
+
anchors = []
|
37
|
+
anchors << elem.anchor if elem.anchor
|
38
|
+
|
39
|
+
# Extract ListItem from any depth of List::Ordered
|
40
|
+
processed = elem
|
41
|
+
while processed.is_a?(Element::List::Ordered)
|
42
|
+
if processed.items.length != 1
|
43
|
+
backtrack = true
|
44
|
+
break
|
45
|
+
end
|
46
|
+
anchors << processed.anchor if processed.anchor
|
47
|
+
processed = processed.items.first
|
48
|
+
end
|
49
|
+
|
50
|
+
# Something went wrong? Anything not matching on the way?
|
51
|
+
next elem if backtrack
|
52
|
+
next elem unless processed.is_a?(Element::ListItem)
|
53
|
+
|
54
|
+
anchors << processed.anchor if processed.anchor
|
55
|
+
|
56
|
+
# Now we must have a title (or titles).
|
57
|
+
titles = processed.content.flatten
|
58
|
+
|
59
|
+
# Don't bother if there's no title in there.
|
60
|
+
next elem unless titles.any? { |i| i.is_a? Element::Title }
|
61
|
+
|
62
|
+
# Ordered is another iteration for our cleanup.
|
63
|
+
next elem unless titles.all? do |i|
|
64
|
+
i.is_a?(Element::Title) || i.is_a?(Element::List::Ordered)
|
65
|
+
end
|
66
|
+
|
67
|
+
# We are done now.
|
68
|
+
titles + anchors
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
15
72
|
# Collapse DIVs that only have a title, or nest another DIV.
|
16
73
|
def collapse_meaningless_sections
|
17
|
-
@tree =
|
18
|
-
if elem.is_a?(
|
74
|
+
@tree = Element::Base.visit(@tree) do |elem, _dir|
|
75
|
+
if elem.is_a?(Element::Section) && elem.safe_to_collapse?
|
19
76
|
children_classes = Array(elem.contents).map(&:class)
|
20
77
|
count = children_classes.length
|
21
|
-
safe_classes = [
|
78
|
+
safe_classes = [Element::Section, Element::Title]
|
22
79
|
|
23
80
|
# Count > 0 because some documents use <div> as a <br>.
|
24
81
|
if count > 0 && children_classes.all? { |i| safe_classes.include?(i) }
|
25
|
-
|
82
|
+
contents = elem.contents.dup
|
83
|
+
contents.prepend(elem.anchor) if elem.anchor
|
84
|
+
next contents
|
26
85
|
end
|
27
86
|
end
|
28
87
|
elem
|
@@ -32,12 +91,14 @@ module Coradoc::Input::HTML
|
|
32
91
|
# tree should now be more cleaned up, so we can progress with
|
33
92
|
# creating meaningful sections
|
34
93
|
def generate_meaningful_sections
|
35
|
-
@tree =
|
94
|
+
@tree = Element::Base.visit(@tree) do |elem, dir|
|
36
95
|
# We are searching for an array, that has a title. This
|
37
96
|
# will be a candidate for our section array.
|
38
97
|
if dir == :post &&
|
39
98
|
elem.is_a?(Array) &&
|
40
|
-
!elem.grep(
|
99
|
+
!elem.flatten.grep(Element::Title).empty?
|
100
|
+
|
101
|
+
elem = elem.flatten
|
41
102
|
|
42
103
|
new_array = []
|
43
104
|
content_array = new_array
|
@@ -47,12 +108,12 @@ module Coradoc::Input::HTML
|
|
47
108
|
# all descendant sections into those sections. Otherwise, we push
|
48
109
|
# an element as content of current section.
|
49
110
|
elem.each do |e|
|
50
|
-
if e.is_a?
|
111
|
+
if e.is_a? Element::Title
|
51
112
|
title = e
|
52
113
|
content_array = []
|
53
114
|
section_array = []
|
54
115
|
level = title.level_int
|
55
|
-
section =
|
116
|
+
section = Element::Section.new(
|
56
117
|
title, contents: content_array, sections: section_array
|
57
118
|
)
|
58
119
|
# Some documents may not be consistent and eg. follow H4 after
|
@@ -82,11 +143,11 @@ module Coradoc::Input::HTML
|
|
82
143
|
previous_sections = {}
|
83
144
|
|
84
145
|
determine_section_id = ->(elem) do
|
85
|
-
if elem.title.style == "appendix"
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
146
|
+
level = if elem.title.style == "appendix"
|
147
|
+
"A"
|
148
|
+
else
|
149
|
+
1
|
150
|
+
end
|
90
151
|
|
91
152
|
section = previous_sections[elem]
|
92
153
|
while section
|
@@ -102,8 +163,8 @@ module Coradoc::Input::HTML
|
|
102
163
|
style
|
103
164
|
end
|
104
165
|
|
105
|
-
@tree =
|
106
|
-
title = elem.title if elem.is_a?(
|
166
|
+
@tree = Element::Base.visit(@tree) do |elem, dir|
|
167
|
+
title = elem.title if elem.is_a?(Element::Section)
|
107
168
|
|
108
169
|
if title && title.level_int <= max_level
|
109
170
|
if dir == :pre
|
@@ -137,6 +198,7 @@ module Coradoc::Input::HTML
|
|
137
198
|
end
|
138
199
|
|
139
200
|
def process
|
201
|
+
extract_titles_from_lists
|
140
202
|
collapse_meaningless_sections
|
141
203
|
generate_meaningful_sections
|
142
204
|
# Do it again to simplify the document further.
|
data/lib/coradoc/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: coradoc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.2
|
4
|
+
version: 1.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2024-
|
12
|
+
date: 2024-11-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: marcel
|