reverse_adoc 0.2.5 → 0.2.6
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +7 -6
- data/bin/reverse_adoc +3 -0
- data/bin/w2a +4 -17
- data/lib/reverse_asciidoctor/cleaner.rb +23 -0
- data/lib/reverse_asciidoctor/converters/h.rb +17 -2
- data/lib/reverse_asciidoctor/converters/img.rb +21 -23
- data/lib/reverse_asciidoctor/version.rb +1 -1
- data/reverse_adoc.gemspec +1 -0
- data/spec/assets/anchors.html +2 -0
- data/spec/assets/test.docx +0 -0
- data/spec/assets/test.html +35 -0
- data/spec/components/anchors_spec.rb +1 -0
- data/spec/lib/reverse_asciidoctor/cleaner_spec.rb +24 -0
- metadata +20 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d3d387a4fbedd246900150be67a80bf62ee70a70cf697b1a2a7a247080d879ad
|
4
|
+
data.tar.gz: 956f8db0ca1d2f34e1f165f323e6db6d1f5e68a91dc886d6fd3cf3f1f7aa0886
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a72c96ffc8e41f7e38b28108107b12bec33a22e3b1722416e7b4d5a493ab3f47d6c3fab49a98b27969ba8235cb3ed6fff86f027a8c9ff5e4b3276af8d2d0a752
|
7
|
+
data.tar.gz: 1f0de4f61007133d49c0dfcd0dd85e8a87f8445a59fb41ba5dd1001eeec4e1405402449389b3e1355a453ff5a21bf6fa7d4b8bfd1fc883c654feee83600e55f3
|
data/README.adoc
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
= AsciiDoc from HTML and Microsoft Word: reverse_adoc
|
2
2
|
|
3
|
-
image:https://img.shields.io/gem/v/
|
4
|
-
image:https://
|
5
|
-
image:https://
|
6
|
-
image:https://
|
7
|
-
image:https://
|
8
|
-
image:https://img.shields.io/github/
|
3
|
+
https://github.com/metanorma/reverse_adoc[reverse_adoc] image:https://img.shields.io/gem/v/reverse_adoc.svg["Gem Version", link="https://rubygems.org/gems/reverse_adoc"]::
|
4
|
+
image:https://github.com/metanorma/reverse_adoc/workflows/macos/badge.svg["Build Status", link="https://github.com/metanorma/reverse_adoc/actions?workflow=macos"]
|
5
|
+
image:https://github.com/metanorma/reverse_adoc/workflows/windows/badge.svg["Build Status", link="https://github.com/metanorma/reverse_adoc/actions?workflow=windows"]
|
6
|
+
image:https://github.com/metanorma/reverse_adoc/workflows/ubuntu/badge.svg["Build Status", link="https://github.com/metanorma/reverse_adoc/actions?workflow=ubuntu"]
|
7
|
+
image:https://codeclimate.com/github/metanorma/reverse_adoc/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/reverse_adoc"]
|
8
|
+
image:https://img.shields.io/github/issues-pr-raw/metanorma/reverse_adoc.svg["Pull Requests", link="https://github.com/metanorma/reverse_adoc/pulls"]
|
9
|
+
image:https://img.shields.io/github/commits-since/metanorma/reverse_adoc/latest.svg["Commits since latest",link="https://github.com/metanorma/reverse_adoc/releases"]
|
9
10
|
|
10
11
|
== Purpose
|
11
12
|
|
data/bin/reverse_adoc
CHANGED
data/bin/w2a
CHANGED
@@ -1,25 +1,12 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# frozen_string_literal: true
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler/setup'
|
3
5
|
|
4
6
|
require 'word-to-markdown'
|
5
7
|
require 'optparse'
|
6
8
|
require 'reverse_asciidoctor'
|
7
9
|
|
8
|
-
def scrub_whitespace(string)
|
9
|
-
string = string.dup
|
10
|
-
string.gsub!(/ |\ |\u00a0/i, ' ') # HTML encoded spaces
|
11
|
-
string.sub!(/^\A[[:space:]]+/m, '') # document leading whitespace
|
12
|
-
string.sub!(/[[:space:]]+\z$/m, '') # document trailing whitespace
|
13
|
-
string.gsub!(/([ ]+)$/, ' ') # line trailing whitespace
|
14
|
-
string.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks
|
15
|
-
#string.delete!(' ') # Unicode non-breaking spaces, injected as tabs
|
16
|
-
# following added by me
|
17
|
-
string.gsub!(%r{<h[1-9][^>]*></h1>}, " ") # I don't know why Libre Office is inserting them, but they need to go
|
18
|
-
string.gsub!(%r{<h1[^>]* style="vertical-align: super;[^>]*>([^<]+)</h1>},
|
19
|
-
"<sup>\\1</sup>") # I absolutely don't know why Libre Office is rendering superscripts as h1
|
20
|
-
string
|
21
|
-
end
|
22
|
-
|
23
10
|
ARGV.push('-h') if ARGV.empty?
|
24
11
|
|
25
12
|
OptionParser.new do |opts|
|
@@ -66,9 +53,9 @@ ReverseAsciidoctor.config.sourcedir = Dir.mktmpdir
|
|
66
53
|
# puts "ReverseAsciidoctor.config.sourcedir #{ReverseAsciidoctor.config.sourcedir}"
|
67
54
|
|
68
55
|
doc = WordToMarkdown.new(filename, ReverseAsciidoctor.config.sourcedir)
|
69
|
-
|
56
|
+
File.open("test.html", "w:UTF-8") { |f| f.write doc.document.html }
|
70
57
|
adoc_content = ReverseAsciidoctor.convert(
|
71
|
-
|
58
|
+
ReverseAsciidoctor.cleaner.preprocess_word_html(doc.document.html),
|
72
59
|
WordToMarkdown::REVERSE_MARKDOWN_OPTIONS
|
73
60
|
)
|
74
61
|
# puts scrub_whitespace(doc.document.html)
|
@@ -64,6 +64,29 @@ module ReverseAsciidoctor
|
|
64
64
|
string.gsub(/(\*\*|~~|__)\s([\.!\?'"])/, "\\1".strip + "\\2")
|
65
65
|
end
|
66
66
|
|
67
|
+
# preprocesses HTML, rather than postprocessing it
|
68
|
+
def preprocess_word_html(string)
|
69
|
+
clean_headings(scrub_whitespace(string.dup))
|
70
|
+
end
|
71
|
+
|
72
|
+
def scrub_whitespace(string)
|
73
|
+
string.gsub!(/ |\ |\u00a0/i, ' ') # HTML encoded spaces
|
74
|
+
string.sub!(/^\A[[:space:]]+/m, '') # document leading whitespace
|
75
|
+
string.sub!(/[[:space:]]+\z$/m, '') # document trailing whitespace
|
76
|
+
string.gsub!(/([ ]+)$/, ' ') # line trailing whitespace
|
77
|
+
string.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks
|
78
|
+
#string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs
|
79
|
+
string
|
80
|
+
end
|
81
|
+
|
82
|
+
# following added by me
|
83
|
+
def clean_headings(string)
|
84
|
+
string.gsub!(%r{<h([1-9])[^>]*></h\1>}, " ") # I don't know why Libre Office is inserting them, but they need to go
|
85
|
+
string.gsub!(%r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>},
|
86
|
+
"<sup>\\2</sup>") # I absolutely don't know why Libre Office is rendering superscripts as h1
|
87
|
+
string
|
88
|
+
end
|
89
|
+
|
67
90
|
private
|
68
91
|
|
69
92
|
def preserve_border_whitespaces(string, options = {}, &block)
|
@@ -3,9 +3,24 @@ module ReverseAsciidoctor
|
|
3
3
|
class H < Base
|
4
4
|
def convert(node, state = {})
|
5
5
|
id = node['id']
|
6
|
-
anchor = id ? "[[#{id}]]
|
6
|
+
anchor = id ? "[[#{id}]]" : ""
|
7
|
+
internal_anchor = treat_children_anchors(node, state) || ""
|
8
|
+
anchor.empty? and anchor = internal_anchor
|
9
|
+
anchor.empty? or anchor += "\n"
|
7
10
|
prefix = '=' * (node.name[/\d/].to_i + 1)
|
8
|
-
["\n", anchor, prefix, ' ',
|
11
|
+
["\n", anchor, prefix, ' ', treat_children_no_anchors(node, state), "\n"].join
|
12
|
+
end
|
13
|
+
|
14
|
+
def treat_children_no_anchors(node, state)
|
15
|
+
node.children.reject { |a| a.name == "a" }.inject('') do |memo, child|
|
16
|
+
memo << treat(child, state)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def treat_children_anchors(node, state)
|
21
|
+
node.children.select { |a| a.name == "a" }.inject('') do |memo, child|
|
22
|
+
memo << treat(child, state)
|
23
|
+
end
|
9
24
|
end
|
10
25
|
end
|
11
26
|
|
@@ -1,5 +1,8 @@
|
|
1
1
|
require "fileutils"
|
2
2
|
require "pathname"
|
3
|
+
require "tempfile"
|
4
|
+
require "base64"
|
5
|
+
require "mimemagic"
|
3
6
|
|
4
7
|
module ReverseAsciidoctor
|
5
8
|
module Converters
|
@@ -23,29 +26,7 @@ module ReverseAsciidoctor
|
|
23
26
|
images_dir = dest_dir + 'images'
|
24
27
|
FileUtils.mkdir_p(images_dir)
|
25
28
|
|
26
|
-
ext =
|
27
|
-
|
28
|
-
if imgdata
|
29
|
-
file = Tempfile.open(["radoc", ".jpg"]) do |f|
|
30
|
-
begin
|
31
|
-
f.binmode
|
32
|
-
f.write(Base64.strict_decode64(imgdata))
|
33
|
-
f.rewind
|
34
|
-
ext = MimeMagic.by_magic(f)
|
35
|
-
ensure
|
36
|
-
f.close!
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
image_src_path = file.path
|
41
|
-
# puts "tempfile: #{file}"
|
42
|
-
|
43
|
-
else
|
44
|
-
ext = File.extname(src).strip.downcase[1..-1]
|
45
|
-
image_src_path = Pathname.new(ReverseAsciidoctor.config.sourcedir) + src
|
46
|
-
|
47
|
-
end
|
48
|
-
|
29
|
+
ext, image_src_path = determine_image_src_path(imgdata)
|
49
30
|
image_dest_path = images_dir + "#{image_number}.#{ext}"
|
50
31
|
|
51
32
|
# puts "image_dest_path: #{image_dest_path.to_s}"
|
@@ -57,6 +38,23 @@ module ReverseAsciidoctor
|
|
57
38
|
image_dest_path.relative_path_from(dest_dir)
|
58
39
|
end
|
59
40
|
|
41
|
+
def determine_image_src_path(imgdata)
|
42
|
+
return copy_temp_file(imgdata) if imgdata
|
43
|
+
|
44
|
+
ext = File.extname(src).strip.downcase[1..-1]
|
45
|
+
[ext, Pathname.new(ReverseAsciidoctor.config.sourcedir) + src]
|
46
|
+
end
|
47
|
+
|
48
|
+
def copy_temp_file(imgdata)
|
49
|
+
Tempfile.open(['radoc', '.jpg']) do |f|
|
50
|
+
f.binmode
|
51
|
+
f.write(Base64.strict_decode64(imgdata))
|
52
|
+
f.rewind
|
53
|
+
ext = MimeMagic.by_magic(f).subtype
|
54
|
+
[ext, f.path]
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
60
58
|
def convert(node, state = {})
|
61
59
|
alt = node['alt']
|
62
60
|
src = node['src']
|
data/reverse_adoc.gemspec
CHANGED
@@ -22,6 +22,7 @@ Gem::Specification.new do |s|
|
|
22
22
|
# specify any dependencies here; for example:
|
23
23
|
s.add_dependency 'nokogiri', ">= 1.10.4"
|
24
24
|
s.add_dependency 'mathml2asciimath'
|
25
|
+
s.add_dependency 'mimemagic'
|
25
26
|
s.add_development_dependency 'rspec'
|
26
27
|
s.add_development_dependency 'simplecov'
|
27
28
|
s.add_development_dependency 'rake'
|
data/spec/assets/anchors.html
CHANGED
Binary file
|
@@ -0,0 +1,35 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN" "http://www.w3.org/Math/DTD/mathml2/xhtml-math11-f.dtd">
|
2
|
+
<?xml version="1.0" encoding="UTF-8"??><html xmlns="http://www.w3.org/1999/xhtml" style="margin: 0;">
|
3
|
+
<!--This file was converted to xhtml by LibreOffice - see http://cgit.freedesktop.org/libreoffice/core/tree/filter/source/xslt for the code.--><head profile="http://dublincore.org/documents/dcmi-terms/">
|
4
|
+
<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8">
|
5
|
+
<meta name="DCTERMS.title" content="" xml:lang="en-US">
|
6
|
+
<meta name="DCTERMS.language" content="en-US" scheme="DCTERMS.RFC4646">
|
7
|
+
<meta name="DCTERMS.source" content="http://xml.openoffice.org/odf2xhtml">
|
8
|
+
<meta name="DCTERMS.creator" content="Nick Nicholas">
|
9
|
+
<meta name="DCTERMS.issued" content="2019-11-21T08:48:00" scheme="DCTERMS.W3CDTF">
|
10
|
+
<meta name="DCTERMS.contributor" content="Nick Nicholas">
|
11
|
+
<meta name="DCTERMS.modified" content="2019-11-21T10:01:00" scheme="DCTERMS.W3CDTF">
|
12
|
+
<meta name="DCTERMS.provenance" content="" xml:lang="en-US">
|
13
|
+
<meta name="DCTERMS.subject" content="," xml:lang="en-US">
|
14
|
+
<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" hreflang="en">
|
15
|
+
<link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" hreflang="en">
|
16
|
+
<link rel="schema.DCTYPE" href="http://purl.org/dc/dcmitype/" hreflang="en">
|
17
|
+
<link rel="schema.DCAM" href="http://purl.org/dc/dcam/" hreflang="en">
|
18
|
+
</head>
|
19
|
+
<body dir="ltr" style="max-width: 21.001cm; margin: 2.54cm 3.175cm;">
|
20
|
+
<h1 class="P3" style="clear: both; color: #2f5496; font-size: 16pt; font-family: Calibri Light; writing-mode: lr-tb; margin: 0.423cm 0 0cm;" align="left ! important">
|
21
|
+
<a id="a__Hello" style="margin: 0;"><span style="margin: 0;"></span></a><span class="T1" style="margin: 0;">Hello</span>
|
22
|
+
</h1>
|
23
|
+
<p class="P1" style="font-size: 12pt; font-family: Calibri; writing-mode: lr-tb; margin: 0;" align="left ! important"> </p>
|
24
|
+
<p class="Standard" style="font-size: 12pt; font-family: Calibri; writing-mode: lr-tb; margin: 0;" align="left ! important"><span class="T1" style="margin: 0;">H</span><h1 class="T3" style="vertical-align: super; font-size: 58%; margin: 0;">2</h1><span class="T1" style="margin: 0;">0</span></p>
|
25
|
+
<p class="P1" style="font-size: 12pt; font-family: Calibri; writing-mode: lr-tb; margin: 0;" align="left ! important"> </p>
|
26
|
+
<!--Next 'div' was a 'text:p'.--><div class="Standard" style="font-size: 12pt; font-family: Calibri; writing-mode: lr-tb; margin: 0;" align="left ! important">
|
27
|
+
<!--Next ' span' is a draw:frame. --><span style="margin: 0;"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block" style="margin: 0;"><mrow style="margin: 0;"><mrow style="margin: 0;"><mrow style="margin: 0;"></mrow><mrow style="margin: 0;"><mi style="margin: 0;">i</mi><mo stretchy="false" style="margin: 0;">=</mo><mn style="margin: 0;">1</mn></mrow></mrow><mrow style="margin: 0;"><mrow style="margin: 0;"></mrow><mi style="margin: 0;">n</mi><msubsup style="margin: 0;"><mi style="margin: 0;">β</mi><mn style="margin: 0;">2</mn><mi style="margin: 0;">i</mi></msubsup></mrow></mrow></math></span>
|
28
|
+
</div>
|
29
|
+
<div style="clear: both; line-height: 0; width: 0; height: 0; margin: 0; padding: 0;"> </div>
|
30
|
+
<!--Next 'div' was a 'text:p'.--><div class="Standard" style="font-size: 12pt; font-family: Calibri; writing-mode: lr-tb; margin: 0;" align="left ! important">
|
31
|
+
<a id="_GoBack" style="margin: 0;"></a><!--Next ' span' is a draw:frame. --><span style="height: 1.764cm; margin: 0cm; padding: 0; border: none; width: 1.764cm; font-size: 12pt; font-family: Calibri; text-align: center; vertical-align: top; background-color: transparent;" class="fr1" id="Εικόνα_2"><img style="height: 1.764cm; width: 1.764cm; margin: 0;" alt="" src=""></span>
|
32
|
+
</div>
|
33
|
+
<div style="clear: both; line-height: 0; width: 0; height: 0; margin: 0; padding: 0;"> </div>
|
34
|
+
</body>
|
35
|
+
</html>
|
@@ -20,6 +20,7 @@ describe ReverseAsciidoctor do
|
|
20
20
|
|
21
21
|
it { is_expected.to include "<<a_bspaced,Double \\_\\_ anchor with space>>" }
|
22
22
|
it { is_expected.to include "[[a_bspaced]]" }
|
23
|
+
it { is_expected.to include "[[a_Foreword]]\n== Text" }
|
23
24
|
it { is_expected.not_to include "[[_Toc12345]]" }
|
24
25
|
|
25
26
|
end
|
@@ -3,6 +3,30 @@ require 'spec_helper'
|
|
3
3
|
describe ReverseAsciidoctor::Cleaner do
|
4
4
|
let(:cleaner) { ReverseAsciidoctor::Cleaner.new }
|
5
5
|
|
6
|
+
describe '#scrub_whitespace' do
|
7
|
+
it "makes consistent nonbreaking spaces" do
|
8
|
+
result = cleaner.scrub_whitespace("   ")
|
9
|
+
expect(result).to eq "     "
|
10
|
+
end
|
11
|
+
|
12
|
+
it "makes four linebreaks into two" do
|
13
|
+
result = cleaner.scrub_whitespace("A\n\n\n\nB")
|
14
|
+
expect(result).to eq "A\n\nB"
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
describe '#clean_headings' do
|
19
|
+
it "removes empty headings" do
|
20
|
+
result = cleaner.clean_headings("<h2></h2>")
|
21
|
+
expect(result).to eq " "
|
22
|
+
end
|
23
|
+
|
24
|
+
it "cleans superscripts rendered as headings" do
|
25
|
+
result = cleaner.clean_headings(%{<p class="Standard" style="font-size: 12pt; font-family: Calibri; writing-mode: lr-tb; margin: 0;" align="left ! important"><span class="T1" style="margin: 0;">H</span><h1 class="T2" style="vertical-align: super; font-size: 58%; margin: 0;">2</h1><span class="T1" style="margin: 0;">0</span></p>})
|
26
|
+
expect(result).to eq %{<p class="Standard" style="font-size: 12pt; font-family: Calibri; writing-mode: lr-tb; margin: 0;" align="left ! important"><span class="T1" style="margin: 0;">H</span><sup>2</sup><span class="T1" style="margin: 0;">0</span></p>}
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
6
30
|
describe '#remove_newlines' do
|
7
31
|
it 'removes more than 2 subsequent newlines' do
|
8
32
|
result = cleaner.remove_newlines("foo\n\n\nbar")
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reverse_adoc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-11-
|
11
|
+
date: 2019-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: mimemagic
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rspec
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -195,6 +209,8 @@ files:
|
|
195
209
|
- spec/assets/paragraphs.html
|
196
210
|
- spec/assets/quotation.html
|
197
211
|
- spec/assets/tables.html
|
212
|
+
- spec/assets/test.docx
|
213
|
+
- spec/assets/test.html
|
198
214
|
- spec/assets/unknown_tags.html
|
199
215
|
- spec/components/anchors_spec.rb
|
200
216
|
- spec/components/basic_spec.rb
|
@@ -265,6 +281,8 @@ test_files:
|
|
265
281
|
- spec/assets/paragraphs.html
|
266
282
|
- spec/assets/quotation.html
|
267
283
|
- spec/assets/tables.html
|
284
|
+
- spec/assets/test.docx
|
285
|
+
- spec/assets/test.html
|
268
286
|
- spec/assets/unknown_tags.html
|
269
287
|
- spec/components/anchors_spec.rb
|
270
288
|
- spec/components/basic_spec.rb
|