coradoc 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 71bfd2ac723d58ca75b591bceabcdf834abd9cbbf32e54b96352da21b8f15095
4
- data.tar.gz: 1a7ea6bf80d64ab4c1349c948aa2aee9ed704dc36ddd07fb7cce78293dca86cd
3
+ metadata.gz: 5e44c5e565e224487496ecdd7be8c5c88e5f05fd2a42a4f2cbf31746a908aa25
4
+ data.tar.gz: a1e8fb651b29b516071e91c5c6f7bf72a68c34fd4ae9512a928d3ea3185bc0c6
5
5
  SHA512:
6
- metadata.gz: 555800621a06ffafc07e03a5c2cdca1210de6369d524a606de51df78b1ee9b19aaba30d32d8a448347237e488362a59677781a02502a1d8891fb89aa1cc267f5
7
- data.tar.gz: 22707b18c6ed99fb4c6127885d871875ed1fed2906d2d10dcdd5f4ed9bdd1b1bfe3f19fd092768ccc4d488bdb7e705aba9ba7ab948ceace9cc679f1ee733eddc
6
+ metadata.gz: 4df0a3edcebe6990006c7ca951b59f717f05d9b25eb6481394fe276dbaa318f6016e7a915d268abc19d61cec0605ab7d44466662d499cfa08a0ffa2f98bd8dcc
7
+ data.tar.gz: 9a44116b58e7e99a14ae856b1b06e23c26cbe2913f970a6849615282c3f8e33b05c538c39c72a827fa65aef30f56c9eb494d2f48574ea01f86bca858d9692e16
@@ -9,7 +9,7 @@ module Coradoc
9
9
  end
10
10
 
11
11
  def to_adoc
12
- "\n\n#{gen_delimiter}" << gen_lines << "\n#{gen_delimiter}\n\n"
12
+ "\n\n#{gen_delimiter}\n" << gen_lines << "\n#{gen_delimiter}\n\n"
13
13
  end
14
14
  end
15
15
  end
@@ -12,6 +12,7 @@ module Coradoc
12
12
  @anchor = @id.nil? ? nil : Coradoc::Element::Inline::Anchor.new(@id)
13
13
  @src = src
14
14
  @attributes = options.fetch(:attributes, AttributeList.new)
15
+ @annotate_missing = options.fetch(:annotate_missing)
15
16
  @title = options.fetch(:title, nil)
16
17
  if @attributes.any?
17
18
  @attributes.validate_positional(VALIDATORS_POSITIONAL)
@@ -20,10 +21,11 @@ module Coradoc
20
21
  end
21
22
 
22
23
  def to_adoc
24
+ missing = "// FIXME: Missing image: #{@annotate_missing}\n" if @annotate_missing
23
25
  anchor = @anchor.nil? ? "" : "#{@anchor.to_adoc}\n"
24
26
  title = ".#{@title}\n" unless @title.to_s.empty?
25
27
  attrs = @attributes.to_adoc
26
- [anchor, title, "image", @colons, @src, attrs].join("")
28
+ [missing, anchor, title, "image", @colons, @src, attrs].join("")
27
29
  end
28
30
 
29
31
  extend AttributeList::Matchers
@@ -25,8 +25,13 @@ module Coradoc
25
25
  @items.each do |item|
26
26
  c = Coradoc::Generator.gen_adoc(item)
27
27
  if !c.empty?
28
- content << prefix.to_s
29
- content << " " if c[0]!=" "
28
+ # If there's a list inside a list directly, we want to
29
+ # skip adding an empty list item.
30
+ # See: https://github.com/metanorma/coradoc/issues/96
31
+ unless item.is_a? List::Core
32
+ content << prefix.to_s
33
+ content << " " if c[0]!=" "
34
+ end
30
35
  content << c
31
36
  end
32
37
  end
@@ -1,9 +1,11 @@
1
1
  module Coradoc
2
2
  module Element
3
3
  module List
4
- class Definition < Core
4
+ class Definition < Base
5
5
  attr_accessor :items, :delimiter
6
6
 
7
+ declare_children :items
8
+
7
9
  def initialize(items, options = {})
8
10
  @items = items
9
11
  @delimiter = options.fetch(:delimiter, "::")
@@ -1,7 +1,7 @@
1
1
  module Coradoc
2
2
  module Element
3
3
  class ListItem < Base
4
- attr_accessor :id
4
+ attr_accessor :id, :content, :anchor
5
5
 
6
6
  declare_children :content, :id, :anchor
7
7
 
@@ -14,8 +14,16 @@ module Coradoc
14
14
  def to_adoc
15
15
  anchor = @anchor.nil? ? "" : @anchor.to_adoc.to_s
16
16
  content = Array(@content).map do |subitem|
17
- Coradoc::Generator.gen_adoc(subitem).chomp
18
- end.join("\n+\n")
17
+ next if subitem.is_a? Coradoc::Element::Inline::HardLineBreak
18
+
19
+ subcontent = Coradoc::Generator.gen_adoc(subitem)
20
+ # Only try to postprocess elements that are text,
21
+ # otherwise we could strip markup.
22
+ if Coradoc.is_a_single?(subitem, Coradoc::Element::TextElement)
23
+ subcontent = Coradoc.strip_unicode(subcontent)
24
+ end
25
+ subcontent.chomp
26
+ end.compact.join("\n+\n")
19
27
 
20
28
  " #{anchor}#{content.chomp}\n"
21
29
  end
@@ -24,9 +24,9 @@ module Coradoc
24
24
  def to_adoc
25
25
  anchor = @anchor.nil? ? "" : "#{@anchor.to_adoc}\n"
26
26
  if @tdsinglepara
27
- anchor.to_s << Coradoc::Generator.gen_adoc(@content).strip
27
+ anchor.to_s << Coradoc.strip_unicode(Coradoc::Generator.gen_adoc(@content))
28
28
  else
29
- "\n\n#{anchor}" << Coradoc::Generator.gen_adoc(@content).strip << "\n\n"
29
+ "\n\n#{anchor}" << Coradoc.strip_unicode(Coradoc::Generator.gen_adoc(@content)) << "\n\n"
30
30
  end
31
31
  end
32
32
  end
@@ -34,6 +34,12 @@ module Coradoc
34
34
  # with something.
35
35
  content = "&nbsp;#{content}" if content.start_with?(" +\n")
36
36
 
37
+ # Only try to postprocess elements that are text,
38
+ # otherwise we could strip markup.
39
+ if Coradoc.is_a_single?(@contents, Coradoc::Element::TextElement)
40
+ content = Coradoc.strip_unicode(content)
41
+ end
42
+
37
43
  "\n#{anchor}" << title << content << sections << "\n"
38
44
  end
39
45
 
@@ -76,6 +76,11 @@ module Coradoc
76
76
  anchor = @anchor.nil? ? "" : @anchor.to_adoc.to_s
77
77
  content = simplify_block_content(@content)
78
78
  content = Coradoc::Generator.gen_adoc(content)
79
+ # Only try to postprocess elements that are text,
80
+ # otherwise we could strip markup.
81
+ if Coradoc.is_a_single?(@content, Coradoc::Element::TextElement)
82
+ content = Coradoc.strip_unicode(content)
83
+ end
79
84
  "#{@colrowattr}#{@alignattr}#{@style}| #{anchor}#{content}"
80
85
  end
81
86
  end
@@ -21,7 +21,7 @@ module Coradoc
21
21
 
22
22
  def to_adoc
23
23
  anchor = @anchor.nil? ? "" : "#{@anchor.to_adoc}\n"
24
- content = Coradoc::Generator.gen_adoc(@content)
24
+ content = Coradoc.strip_unicode(Coradoc::Generator.gen_adoc(@content))
25
25
  <<~HERE
26
26
 
27
27
  #{anchor}#{style_str}#{level_str} #{content}
@@ -83,8 +83,7 @@ module Coradoc::ReverseAdoc
83
83
 
84
84
  def scrub_whitespace(string)
85
85
  string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, "&#xA0;") # HTML encoded spaces
86
- string.sub!(/^\A[[:space:]]+/m, "") # document leading whitespace
87
- string.sub!(/[[:space:]]+\z$/m, "") # document trailing whitespace
86
+ string = Coradoc.strip_unicode(string) # Strip document-level leading and trailing whitespace
88
87
  string.gsub!(/( +)$/, " ") # line trailing whitespace
89
88
  string.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks
90
89
  # string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs
@@ -33,7 +33,13 @@ module Coradoc::ReverseAdoc
33
33
  # puts "image_dest_path: #{image_dest_path.to_s}"
34
34
  # puts "image_src_path: #{image_src_path.to_s}"
35
35
 
36
- FileUtils.cp(image_src_path, image_dest_path)
36
+ if File.exist?(image_src_path)
37
+ FileUtils.cp(image_src_path, image_dest_path)
38
+ else
39
+ @annotate_missing = image_src_path
40
+ Kernel.warn "Image #{image_src_path} does not exist"
41
+ end
42
+
37
43
  image_number_increment
38
44
 
39
45
  image_dest_path.relative_path_from(dest_dir)
@@ -88,11 +94,12 @@ module Coradoc::ReverseAdoc
88
94
 
89
95
  if src
90
96
  Coradoc::Element::Image::BlockImage.new(title, id, src,
91
- attributes: attributes)
97
+ attributes: attributes,
98
+ annotate_missing: @annotate_missing)
92
99
  end
93
100
  end
94
101
  end
95
102
 
96
- register :img, Img.new
103
+ register :img, Img
97
104
  end
98
105
  end
@@ -53,13 +53,17 @@ module Coradoc::ReverseAdoc
53
53
  rules_attr = rules(node)
54
54
  attrs.add_named("rules", rules_attr) if rules_attr
55
55
 
56
- cols = ensure_row_column_integrity_and_get_column_sizes(node)
57
- attrs.add_named("cols", cols)
58
-
59
- # Header first rows can't span multiple riws - drop header if they do.
60
- header = node.at_xpath(".//tr")
61
- unless header.xpath("./td | ./th").all? { |i| [nil, "1", ""].include? i["rowspan"] }
62
- attrs.add_named("options", ["noheader"])
56
+ # We can't, and shouldn't do those calculation if the table we are
57
+ # processing is empty.
58
+ unless empty?(node)
59
+ cols = ensure_row_column_integrity_and_get_column_sizes(node)
60
+ attrs.add_named("cols", cols)
61
+
62
+ # Header first rows can't span multiple riws - drop header if they do.
63
+ header = node.at_xpath(".//tr")
64
+ unless header.xpath("./td | ./th").all? { |i| [nil, "1", ""].include? i["rowspan"] }
65
+ attrs.add_named("options", ["noheader"])
66
+ end
63
67
  end
64
68
 
65
69
  # This line should be removed.
@@ -68,6 +72,10 @@ module Coradoc::ReverseAdoc
68
72
  attrs
69
73
  end
70
74
 
75
+ def empty?(node)
76
+ !node.at_xpath(".//td | .//th")
77
+ end
78
+
71
79
  def ensure_row_column_integrity_and_get_column_sizes(node)
72
80
  rows = node.xpath(".//tr")
73
81
  num_rows = rows.length
@@ -173,7 +181,9 @@ module Coradoc::ReverseAdoc
173
181
 
174
182
  row_obj = row.last.first.parent
175
183
  doc = row_obj.document
176
- row_obj.add_child(Nokogiri::XML::Node.new("td", doc))
184
+ added_node = Nokogiri::XML::Node.new("td", doc)
185
+ added_node["x-added"] = "x-added"
186
+ row_obj.add_child(added_node)
177
187
 
178
188
  modified = true
179
189
  end
@@ -194,6 +204,21 @@ module Coradoc::ReverseAdoc
194
204
  end
195
205
 
196
206
  unless cell_matrix_correct
207
+ # It may be a special case that we need to add virtual cells at the
208
+ # beginning not the end of a row.
209
+ needs_recompute = false
210
+ cell_matrix.each do |row|
211
+ if row.compact.length != row.length
212
+ last_cell = row.last
213
+ if last_cell["x-added"]
214
+ last_cell.parent.prepend_child(last_cell)
215
+ needs_recompute = true
216
+ end
217
+ end
218
+ end
219
+ recompute.() if needs_recompute
220
+
221
+ # But otherwise... we've got a really nasty table.
197
222
  warn <<~WARNING.gsub("\n", " ")
198
223
  **** Couldn't construct a valid image of a table on line
199
224
  #{node.line}. We need that to reliably compute column
@@ -10,7 +10,9 @@ module Coradoc::ReverseAdoc
10
10
  end
11
11
 
12
12
  def self.lookup(tag_name)
13
- @@converters[tag_name.to_sym] or default_converter(tag_name)
13
+ converter = @@converters[tag_name.to_sym] || default_converter(tag_name)
14
+ converter = converter.new if converter.respond_to? :new
15
+ converter
14
16
  end
15
17
 
16
18
  # Note: process won't run plugin hooks
@@ -30,12 +30,20 @@ module Coradoc::ReverseAdoc
30
30
  html_tree_change_tag_name_by_css(".pitemdata", "h3")
31
31
  html_tree_change_tag_name_by_css(".sitemdata", "h4")
32
32
  html_tree_change_tag_name_by_css('td[bgcolor="#D0CECE"]', "th")
33
+ html_tree_change_tag_name_by_css('td[bgcolor="#d0cece"]', "th")
34
+ html_tree_change_tag_name_by_css('.framedata, .frame_container_box', 'aside')
35
+ html_tree_change_tag_name_by_css('.frame2data', 'pre')
36
+ # Assumption that all code snippets in those documents are XML...
37
+ html_tree_change_properties_by_css(".frame2data", class: "brush:xml;")
33
38
 
34
39
  # Remove some CSS ids that are not important to us
35
40
  html_tree_change_properties_by_css("#__nuxt", id: nil)
36
41
  html_tree_change_properties_by_css("#__layout", id: nil)
37
42
  html_tree_change_properties_by_css("#app", id: nil)
38
43
 
44
+ # Handle lists of document 02
45
+ html_tree_replace_with_children_by_css(".list_num-wrap")
46
+
39
47
  # Convert table/img caption to become a caption
40
48
  html_tree.css(".imagedata").each do |e|
41
49
  table = e.parent.next&.children&.first
@@ -75,66 +83,82 @@ module Coradoc::ReverseAdoc
75
83
  end
76
84
  end
77
85
 
78
- html_tree_add_hook_pre_by_css ".text3data" do |node,|
79
- text = html_tree_process_to_adoc(node).strip
80
- next "" if text.empty? || text == "\u3000"
81
-
82
- text = text.strip.gsub(/^/, "*** ")
83
- "\n\n//-PT3D\n#{text}\n//-ENDPT3D\n\n"
84
- end
85
-
86
- html_tree_add_hook_pre_by_css ".text4data" do |node,|
87
- text = html_tree_process_to_adoc(node).strip
88
- next "" if text.empty? || text == "\u3000"
86
+ (3..4).each do |i|
87
+ html_tree_add_hook_pre_by_css ".text#{i}data" do |node,|
88
+ text = html_tree_process_to_adoc(node).strip
89
+ next "" if text.empty? || text == "\u3000"
89
90
 
90
- text = text.strip.gsub(/^/, "**** ")
91
- "\n\n//-PT4D\n#{text}\n//-ENDPT4D\n\n"
91
+ text = text.strip.gsub(/^/, "#{'*' * i} ")
92
+ "\n\n//-PT#{i}D\n#{text}\n//-ENDPT#{i}D\n\n"
93
+ end
92
94
  end
93
95
 
94
- html_tree_add_hook_pre_by_css ".text2data_point ul" do |node,|
95
- text = html_tree_process_to_adoc(node.children.first.children).strip
96
+ (2..3).each do |i|
97
+ html_tree_add_hook_pre_by_css ".text#{i}data_point ul" do |node,|
98
+ text = html_tree_process_to_adoc(node.children.first.children).strip
96
99
 
97
- "** #{text}\n"
100
+ "#{'*' * i} #{text}\n"
101
+ end
98
102
  end
99
103
 
100
- html_tree_add_hook_pre_by_css ".text3data_point ul" do |node,|
101
- text = html_tree_process_to_adoc(node.children.first.children).strip
104
+ (1..20).each do |i|
105
+ html_tree_add_hook_pre_by_css ".numtextdata_num .list_num#{i}" do |node,|
106
+ text = html_tree_process_to_adoc(node).strip
102
107
 
103
- "*** #{text}\n"
108
+ "[start=#{i}]\n. #{text}\n"
109
+ end
104
110
  end
105
111
 
106
112
  # html_tree_preview
107
113
  end
108
114
 
115
+ IM = /[A-Z0-9]{1,3}/
116
+
109
117
  def handle_headers(node, coradoc, state)
110
- if coradoc.id.start_with?("toc0_")
111
- content = coradoc.content.map(&:content).join
118
+ content = coradoc.content.map(&:content).join
119
+
120
+ if %w[toc0 toc_0].any? { |i| coradoc.id&.start_with?(i) }
112
121
  # Special content
113
122
  case content.strip
114
123
  when "はじめに" # Introduction
115
124
  coradoc.style = "abstract" # The older version document has ".preface"
125
+ coradoc.level_int = 1
116
126
  when "改定の概要" # Revision overview
117
127
  coradoc.style = "abstract" # The older version document has ".preface"
128
+ coradoc.level_int = 1
118
129
  when "参考文献" # Bibliography
119
130
  coradoc.style = "bibliography"
131
+ coradoc.level_int = 1
120
132
  when "改訂履歴" # Document history
121
133
  coradoc.style = "appendix"
134
+ coradoc.level_int = 1
135
+ when "0 概要" # Overview
136
+ coradoc.style = "abstract" # I'm not sure this is correct
137
+ coradoc.level_int = 1
138
+ when "索引" # Index
139
+ coradoc.style = "index" # I'm not sure this is correct
140
+ coradoc.level_int = 1
122
141
  else
123
- warn "Unknown section #{coradoc.content.map(&:content).join.inspect}"
142
+ warn "Unknown section #{content.inspect}"
124
143
  end
144
+ end
125
145
 
126
- # Ensure they are generated as level 1
127
- coradoc.level_int = 1
146
+ if node.name == "h1"
147
+ if content.start_with?("Annex")
148
+ coradoc.style = "appendix"
149
+ coradoc.content.first.content.sub!(/\AAnnex [A-Z]/, "")
150
+ end
128
151
  end
129
152
 
130
153
  # Remove numbers
131
- coradoc.content.first.content.sub!(/\A[\d\s.]+/, "")
154
+ coradoc.content.first.content.sub!(/\A(#{IM}\.)*#{IM}[[:space:]]/, "")
132
155
 
133
156
  coradoc
134
157
  end
135
158
 
136
159
  def handle_headers_h4(node, coradoc, state)
137
- case coradoc.content.first.content
160
+ title = Coradoc.strip_unicode(coradoc.content.first.content)
161
+ case title
138
162
  when /\A\(\d+\)(.*)/
139
163
  coradoc.level_int = 4
140
164
  coradoc.content.first.content = $1.strip
@@ -143,8 +167,16 @@ module Coradoc::ReverseAdoc
143
167
  coradoc.level_int = 5
144
168
  coradoc.content.first.content = $1.strip
145
169
  coradoc
170
+ when /\A#{IM}\.#{IM}\.#{IM}\.#{IM}(.*)/
171
+ coradoc.level_int = 4
172
+ coradoc.content.first.content = $1.strip
146
173
  else
147
- ["// FIXME\n", coradoc]
174
+ if title.empty?
175
+ # Strip instances of faulty empty paragraphs
176
+ nil
177
+ else
178
+ ["// FIXME\n", coradoc]
179
+ end
148
180
  end
149
181
  end
150
182
 
@@ -82,13 +82,18 @@ module Coradoc::ReverseAdoc
82
82
  previous_sections = {}
83
83
 
84
84
  determine_section_id = ->(elem) do
85
- level = 0
86
- section = elem
85
+ if elem.title.style == "appendix"
86
+ level = "A"
87
+ else
88
+ level = 1
89
+ end
90
+
91
+ section = previous_sections[elem]
87
92
  while section
88
- level += 1 if elem.title.style == section.title.style
93
+ level = level.succ if elem.title.style == section.title.style
89
94
  section = previous_sections[section]
90
95
  end
91
- level
96
+ level.is_a?(Integer) ? "%02d" % level : level
92
97
  end
93
98
 
94
99
  determine_style = ->(elem) do
@@ -114,8 +119,7 @@ module Coradoc::ReverseAdoc
114
119
  # include tag.
115
120
  section_file = "sections/"
116
121
  section_file += parent_sections[1..title.level_int].map do |parent|
117
- style = determine_style.(parent)
118
- "%s%02d" % [style, determine_section_id.(parent)]
122
+ determine_style.(parent) + determine_section_id.(parent)
119
123
  end.join("/")
120
124
  section_file += ".adoc"
121
125
 
@@ -0,0 +1,10 @@
1
+ module Coradoc
2
+ def self.strip_unicode(str)
3
+ str.gsub(/\A[[:space:]]+|[[:space:]]+\z/, "")
4
+ end
5
+
6
+ def self.is_a_single?(obj, klass)
7
+ obj.is_a?(klass) ||
8
+ (obj.is_a?(Array) && obj.length == 1 && obj.first.is_a?(klass))
9
+ end
10
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Coradoc
4
- VERSION = "1.0.0"
4
+ VERSION = "1.1.0"
5
5
  end
data/lib/coradoc.rb CHANGED
@@ -4,6 +4,7 @@ require "pathname"
4
4
 
5
5
  require "parslet"
6
6
  require_relative "coradoc/version"
7
+ require_relative "coradoc/util"
7
8
  require_relative "coradoc/parser"
8
9
  require_relative "coradoc/transformer"
9
10
  require_relative "coradoc/generator"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coradoc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2024-06-01 00:00:00.000000000 Z
12
+ date: 2024-06-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: marcel
@@ -356,6 +356,7 @@ files:
356
356
  - lib/coradoc/reverse_adoc/plugins/plateau.rb
357
357
  - lib/coradoc/reverse_adoc/postprocessor.rb
358
358
  - lib/coradoc/transformer.rb
359
+ - lib/coradoc/util.rb
359
360
  - lib/coradoc/version.rb
360
361
  - lib/reverse_adoc.rb
361
362
  - todo.md