reverse_adoc 0.2.5 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.adoc +7 -6
- data/bin/reverse_adoc +3 -0
- data/bin/w2a +4 -17
- data/lib/reverse_asciidoctor/cleaner.rb +23 -0
- data/lib/reverse_asciidoctor/converters/h.rb +17 -2
- data/lib/reverse_asciidoctor/converters/img.rb +21 -23
- data/lib/reverse_asciidoctor/version.rb +1 -1
- data/reverse_adoc.gemspec +1 -0
- data/spec/assets/anchors.html +2 -0
- data/spec/assets/test.docx +0 -0
- data/spec/assets/test.html +35 -0
- data/spec/components/anchors_spec.rb +1 -0
- data/spec/lib/reverse_asciidoctor/cleaner_spec.rb +24 -0
- metadata +20 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d3d387a4fbedd246900150be67a80bf62ee70a70cf697b1a2a7a247080d879ad
|
4
|
+
data.tar.gz: 956f8db0ca1d2f34e1f165f323e6db6d1f5e68a91dc886d6fd3cf3f1f7aa0886
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a72c96ffc8e41f7e38b28108107b12bec33a22e3b1722416e7b4d5a493ab3f47d6c3fab49a98b27969ba8235cb3ed6fff86f027a8c9ff5e4b3276af8d2d0a752
|
7
|
+
data.tar.gz: 1f0de4f61007133d49c0dfcd0dd85e8a87f8445a59fb41ba5dd1001eeec4e1405402449389b3e1355a453ff5a21bf6fa7d4b8bfd1fc883c654feee83600e55f3
|
data/README.adoc
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
= AsciiDoc from HTML and Microsoft Word: reverse_adoc
|
2
2
|
|
3
|
-
image:https://img.shields.io/gem/v/
|
4
|
-
image:https://
|
5
|
-
image:https://
|
6
|
-
image:https://
|
7
|
-
image:https://
|
8
|
-
image:https://img.shields.io/github/
|
3
|
+
https://github.com/metanorma/reverse_adoc[reverse_adoc] image:https://img.shields.io/gem/v/reverse_adoc.svg["Gem Version", link="https://rubygems.org/gems/reverse_adoc"]::
|
4
|
+
image:https://github.com/metanorma/reverse_adoc/workflows/macos/badge.svg["Build Status", link="https://github.com/metanorma/reverse_adoc/actions?workflow=macos"]
|
5
|
+
image:https://github.com/metanorma/reverse_adoc/workflows/windows/badge.svg["Build Status", link="https://github.com/metanorma/reverse_adoc/actions?workflow=windows"]
|
6
|
+
image:https://github.com/metanorma/reverse_adoc/workflows/ubuntu/badge.svg["Build Status", link="https://github.com/metanorma/reverse_adoc/actions?workflow=ubuntu"]
|
7
|
+
image:https://codeclimate.com/github/metanorma/reverse_adoc/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/reverse_adoc"]
|
8
|
+
image:https://img.shields.io/github/issues-pr-raw/metanorma/reverse_adoc.svg["Pull Requests", link="https://github.com/metanorma/reverse_adoc/pulls"]
|
9
|
+
image:https://img.shields.io/github/commits-since/metanorma/reverse_adoc/latest.svg["Commits since latest",link="https://github.com/metanorma/reverse_adoc/releases"]
|
9
10
|
|
10
11
|
== Purpose
|
11
12
|
|
data/bin/reverse_adoc
CHANGED
data/bin/w2a
CHANGED
@@ -1,25 +1,12 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# frozen_string_literal: true
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler/setup'
|
3
5
|
|
4
6
|
require 'word-to-markdown'
|
5
7
|
require 'optparse'
|
6
8
|
require 'reverse_asciidoctor'
|
7
9
|
|
8
|
-
def scrub_whitespace(string)
|
9
|
-
string = string.dup
|
10
|
-
string.gsub!(/ |\ |\u00a0/i, ' ') # HTML encoded spaces
|
11
|
-
string.sub!(/^\A[[:space:]]+/m, '') # document leading whitespace
|
12
|
-
string.sub!(/[[:space:]]+\z$/m, '') # document trailing whitespace
|
13
|
-
string.gsub!(/([ ]+)$/, ' ') # line trailing whitespace
|
14
|
-
string.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks
|
15
|
-
#string.delete!(' ') # Unicode non-breaking spaces, injected as tabs
|
16
|
-
# following added by me
|
17
|
-
string.gsub!(%r{<h[1-9][^>]*></h1>}, " ") # I don't know why Libre Office is inserting them, but they need to go
|
18
|
-
string.gsub!(%r{<h1[^>]* style="vertical-align: super;[^>]*>([^<]+)</h1>},
|
19
|
-
"<sup>\\1</sup>") # I absolutely don't know why Libre Office is rendering superscripts as h1
|
20
|
-
string
|
21
|
-
end
|
22
|
-
|
23
10
|
ARGV.push('-h') if ARGV.empty?
|
24
11
|
|
25
12
|
OptionParser.new do |opts|
|
@@ -66,9 +53,9 @@ ReverseAsciidoctor.config.sourcedir = Dir.mktmpdir
|
|
66
53
|
# puts "ReverseAsciidoctor.config.sourcedir #{ReverseAsciidoctor.config.sourcedir}"
|
67
54
|
|
68
55
|
doc = WordToMarkdown.new(filename, ReverseAsciidoctor.config.sourcedir)
|
69
|
-
|
56
|
+
File.open("test.html", "w:UTF-8") { |f| f.write doc.document.html }
|
70
57
|
adoc_content = ReverseAsciidoctor.convert(
|
71
|
-
|
58
|
+
ReverseAsciidoctor.cleaner.preprocess_word_html(doc.document.html),
|
72
59
|
WordToMarkdown::REVERSE_MARKDOWN_OPTIONS
|
73
60
|
)
|
74
61
|
# puts scrub_whitespace(doc.document.html)
|
@@ -64,6 +64,29 @@ module ReverseAsciidoctor
|
|
64
64
|
string.gsub(/(\*\*|~~|__)\s([\.!\?'"])/, "\\1".strip + "\\2")
|
65
65
|
end
|
66
66
|
|
67
|
+
# preprocesses HTML, rather than postprocessing it
|
68
|
+
def preprocess_word_html(string)
|
69
|
+
clean_headings(scrub_whitespace(string.dup))
|
70
|
+
end
|
71
|
+
|
72
|
+
def scrub_whitespace(string)
|
73
|
+
string.gsub!(/ |\ |\u00a0/i, ' ') # HTML encoded spaces
|
74
|
+
string.sub!(/^\A[[:space:]]+/m, '') # document leading whitespace
|
75
|
+
string.sub!(/[[:space:]]+\z$/m, '') # document trailing whitespace
|
76
|
+
string.gsub!(/([ ]+)$/, ' ') # line trailing whitespace
|
77
|
+
string.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks
|
78
|
+
#string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs
|
79
|
+
string
|
80
|
+
end
|
81
|
+
|
82
|
+
# following added by me
|
83
|
+
def clean_headings(string)
|
84
|
+
string.gsub!(%r{<h([1-9])[^>]*></h\1>}, " ") # I don't know why Libre Office is inserting them, but they need to go
|
85
|
+
string.gsub!(%r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>},
|
86
|
+
"<sup>\\2</sup>") # I absolutely don't know why Libre Office is rendering superscripts as h1
|
87
|
+
string
|
88
|
+
end
|
89
|
+
|
67
90
|
private
|
68
91
|
|
69
92
|
def preserve_border_whitespaces(string, options = {}, &block)
|
@@ -3,9 +3,24 @@ module ReverseAsciidoctor
|
|
3
3
|
class H < Base
|
4
4
|
def convert(node, state = {})
|
5
5
|
id = node['id']
|
6
|
-
anchor = id ? "[[#{id}]]
|
6
|
+
anchor = id ? "[[#{id}]]" : ""
|
7
|
+
internal_anchor = treat_children_anchors(node, state) || ""
|
8
|
+
anchor.empty? and anchor = internal_anchor
|
9
|
+
anchor.empty? or anchor += "\n"
|
7
10
|
prefix = '=' * (node.name[/\d/].to_i + 1)
|
8
|
-
["\n", anchor, prefix, ' ',
|
11
|
+
["\n", anchor, prefix, ' ', treat_children_no_anchors(node, state), "\n"].join
|
12
|
+
end
|
13
|
+
|
14
|
+
def treat_children_no_anchors(node, state)
|
15
|
+
node.children.reject { |a| a.name == "a" }.inject('') do |memo, child|
|
16
|
+
memo << treat(child, state)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def treat_children_anchors(node, state)
|
21
|
+
node.children.select { |a| a.name == "a" }.inject('') do |memo, child|
|
22
|
+
memo << treat(child, state)
|
23
|
+
end
|
9
24
|
end
|
10
25
|
end
|
11
26
|
|
@@ -1,5 +1,8 @@
|
|
1
1
|
require "fileutils"
|
2
2
|
require "pathname"
|
3
|
+
require "tempfile"
|
4
|
+
require "base64"
|
5
|
+
require "mimemagic"
|
3
6
|
|
4
7
|
module ReverseAsciidoctor
|
5
8
|
module Converters
|
@@ -23,29 +26,7 @@ module ReverseAsciidoctor
|
|
23
26
|
images_dir = dest_dir + 'images'
|
24
27
|
FileUtils.mkdir_p(images_dir)
|
25
28
|
|
26
|
-
ext =
|
27
|
-
|
28
|
-
if imgdata
|
29
|
-
file = Tempfile.open(["radoc", ".jpg"]) do |f|
|
30
|
-
begin
|
31
|
-
f.binmode
|
32
|
-
f.write(Base64.strict_decode64(imgdata))
|
33
|
-
f.rewind
|
34
|
-
ext = MimeMagic.by_magic(f)
|
35
|
-
ensure
|
36
|
-
f.close!
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
image_src_path = file.path
|
41
|
-
# puts "tempfile: #{file}"
|
42
|
-
|
43
|
-
else
|
44
|
-
ext = File.extname(src).strip.downcase[1..-1]
|
45
|
-
image_src_path = Pathname.new(ReverseAsciidoctor.config.sourcedir) + src
|
46
|
-
|
47
|
-
end
|
48
|
-
|
29
|
+
ext, image_src_path = determine_image_src_path(imgdata)
|
49
30
|
image_dest_path = images_dir + "#{image_number}.#{ext}"
|
50
31
|
|
51
32
|
# puts "image_dest_path: #{image_dest_path.to_s}"
|
@@ -57,6 +38,23 @@ module ReverseAsciidoctor
|
|
57
38
|
image_dest_path.relative_path_from(dest_dir)
|
58
39
|
end
|
59
40
|
|
41
|
+
def determine_image_src_path(imgdata)
|
42
|
+
return copy_temp_file(imgdata) if imgdata
|
43
|
+
|
44
|
+
ext = File.extname(src).strip.downcase[1..-1]
|
45
|
+
[ext, Pathname.new(ReverseAsciidoctor.config.sourcedir) + src]
|
46
|
+
end
|
47
|
+
|
48
|
+
def copy_temp_file(imgdata)
|
49
|
+
Tempfile.open(['radoc', '.jpg']) do |f|
|
50
|
+
f.binmode
|
51
|
+
f.write(Base64.strict_decode64(imgdata))
|
52
|
+
f.rewind
|
53
|
+
ext = MimeMagic.by_magic(f).subtype
|
54
|
+
[ext, f.path]
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
60
58
|
def convert(node, state = {})
|
61
59
|
alt = node['alt']
|
62
60
|
src = node['src']
|
data/reverse_adoc.gemspec
CHANGED
@@ -22,6 +22,7 @@ Gem::Specification.new do |s|
|
|
22
22
|
# specify any dependencies here; for example:
|
23
23
|
s.add_dependency 'nokogiri', ">= 1.10.4"
|
24
24
|
s.add_dependency 'mathml2asciimath'
|
25
|
+
s.add_dependency 'mimemagic'
|
25
26
|
s.add_development_dependency 'rspec'
|
26
27
|
s.add_development_dependency 'simplecov'
|
27
28
|
s.add_development_dependency 'rake'
|
data/spec/assets/anchors.html
CHANGED
Binary file
|
@@ -0,0 +1,35 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN" "http://www.w3.org/Math/DTD/mathml2/xhtml-math11-f.dtd">
|
2
|
+
<?xml version="1.0" encoding="UTF-8"??><html xmlns="http://www.w3.org/1999/xhtml" style="margin: 0;">
|
3
|
+
<!--This file was converted to xhtml by LibreOffice - see http://cgit.freedesktop.org/libreoffice/core/tree/filter/source/xslt for the code.--><head profile="http://dublincore.org/documents/dcmi-terms/">
|
4
|
+
<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8">
|
5
|
+
<meta name="DCTERMS.title" content="" xml:lang="en-US">
|
6
|
+
<meta name="DCTERMS.language" content="en-US" scheme="DCTERMS.RFC4646">
|
7
|
+
<meta name="DCTERMS.source" content="http://xml.openoffice.org/odf2xhtml">
|
8
|
+
<meta name="DCTERMS.creator" content="Nick Nicholas">
|
9
|
+
<meta name="DCTERMS.issued" content="2019-11-21T08:48:00" scheme="DCTERMS.W3CDTF">
|
10
|
+
<meta name="DCTERMS.contributor" content="Nick Nicholas">
|
11
|
+
<meta name="DCTERMS.modified" content="2019-11-21T10:01:00" scheme="DCTERMS.W3CDTF">
|
12
|
+
<meta name="DCTERMS.provenance" content="" xml:lang="en-US">
|
13
|
+
<meta name="DCTERMS.subject" content="," xml:lang="en-US">
|
14
|
+
<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" hreflang="en">
|
15
|
+
<link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" hreflang="en">
|
16
|
+
<link rel="schema.DCTYPE" href="http://purl.org/dc/dcmitype/" hreflang="en">
|
17
|
+
<link rel="schema.DCAM" href="http://purl.org/dc/dcam/" hreflang="en">
|
18
|
+
</head>
|
19
|
+
<body dir="ltr" style="max-width: 21.001cm; margin: 2.54cm 3.175cm;">
|
20
|
+
<h1 class="P3" style="clear: both; color: #2f5496; font-size: 16pt; font-family: Calibri Light; writing-mode: lr-tb; margin: 0.423cm 0 0cm;" align="left ! important">
|
21
|
+
<a id="a__Hello" style="margin: 0;"><span style="margin: 0;"></span></a><span class="T1" style="margin: 0;">Hello</span>
|
22
|
+
</h1>
|
23
|
+
<p class="P1" style="font-size: 12pt; font-family: Calibri; writing-mode: lr-tb; margin: 0;" align="left ! important"> </p>
|
24
|
+
<p class="Standard" style="font-size: 12pt; font-family: Calibri; writing-mode: lr-tb; margin: 0;" align="left ! important"><span class="T1" style="margin: 0;">H</span><h1 class="T3" style="vertical-align: super; font-size: 58%; margin: 0;">2</h1><span class="T1" style="margin: 0;">0</span></p>
|
25
|
+
<p class="P1" style="font-size: 12pt; font-family: Calibri; writing-mode: lr-tb; margin: 0;" align="left ! important"> </p>
|
26
|
+
<!--Next 'div' was a 'text:p'.--><div class="Standard" style="font-size: 12pt; font-family: Calibri; writing-mode: lr-tb; margin: 0;" align="left ! important">
|
27
|
+
<!--Next ' span' is a draw:frame. --><span style="margin: 0;"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block" style="margin: 0;"><mrow style="margin: 0;"><mrow style="margin: 0;"><mrow style="margin: 0;"></mrow><mrow style="margin: 0;"><mi style="margin: 0;">i</mi><mo stretchy="false" style="margin: 0;">=</mo><mn style="margin: 0;">1</mn></mrow></mrow><mrow style="margin: 0;"><mrow style="margin: 0;"></mrow><mi style="margin: 0;">n</mi><msubsup style="margin: 0;"><mi style="margin: 0;">β</mi><mn style="margin: 0;">2</mn><mi style="margin: 0;">i</mi></msubsup></mrow></mrow></math></span>
|
28
|
+
</div>
|
29
|
+
<div style="clear: both; line-height: 0; width: 0; height: 0; margin: 0; padding: 0;"> </div>
|
30
|
+
<!--Next 'div' was a 'text:p'.--><div class="Standard" style="font-size: 12pt; font-family: Calibri; writing-mode: lr-tb; margin: 0;" align="left ! important">
|
31
|
+
<a id="_GoBack" style="margin: 0;"></a><!--Next ' span' is a draw:frame. --><span style="height: 1.764cm; margin: 0cm; padding: 0; border: none; width: 1.764cm; font-size: 12pt; font-family: Calibri; text-align: center; vertical-align: top; background-color: transparent;" class="fr1" id="Εικόνα_2"><img style="height: 1.764cm; width: 1.764cm; margin: 0;" alt="" src=""></span>
|
32
|
+
</div>
|
33
|
+
<div style="clear: both; line-height: 0; width: 0; height: 0; margin: 0; padding: 0;"> </div>
|
34
|
+
</body>
|
35
|
+
</html>
|
@@ -20,6 +20,7 @@ describe ReverseAsciidoctor do
|
|
20
20
|
|
21
21
|
it { is_expected.to include "<<a_bspaced,Double \\_\\_ anchor with space>>" }
|
22
22
|
it { is_expected.to include "[[a_bspaced]]" }
|
23
|
+
it { is_expected.to include "[[a_Foreword]]\n== Text" }
|
23
24
|
it { is_expected.not_to include "[[_Toc12345]]" }
|
24
25
|
|
25
26
|
end
|
@@ -3,6 +3,30 @@ require 'spec_helper'
|
|
3
3
|
describe ReverseAsciidoctor::Cleaner do
|
4
4
|
let(:cleaner) { ReverseAsciidoctor::Cleaner.new }
|
5
5
|
|
6
|
+
describe '#scrub_whitespace' do
|
7
|
+
it "makes consistent nonbreaking spaces" do
|
8
|
+
result = cleaner.scrub_whitespace("   ")
|
9
|
+
expect(result).to eq "     "
|
10
|
+
end
|
11
|
+
|
12
|
+
it "makes four linebreaks into two" do
|
13
|
+
result = cleaner.scrub_whitespace("A\n\n\n\nB")
|
14
|
+
expect(result).to eq "A\n\nB"
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
describe '#clean_headings' do
|
19
|
+
it "removes empty headings" do
|
20
|
+
result = cleaner.clean_headings("<h2></h2>")
|
21
|
+
expect(result).to eq " "
|
22
|
+
end
|
23
|
+
|
24
|
+
it "cleans superscripts rendered as headings" do
|
25
|
+
result = cleaner.clean_headings(%{<p class="Standard" style="font-size: 12pt; font-family: Calibri; writing-mode: lr-tb; margin: 0;" align="left ! important"><span class="T1" style="margin: 0;">H</span><h1 class="T2" style="vertical-align: super; font-size: 58%; margin: 0;">2</h1><span class="T1" style="margin: 0;">0</span></p>})
|
26
|
+
expect(result).to eq %{<p class="Standard" style="font-size: 12pt; font-family: Calibri; writing-mode: lr-tb; margin: 0;" align="left ! important"><span class="T1" style="margin: 0;">H</span><sup>2</sup><span class="T1" style="margin: 0;">0</span></p>}
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
6
30
|
describe '#remove_newlines' do
|
7
31
|
it 'removes more than 2 subsequent newlines' do
|
8
32
|
result = cleaner.remove_newlines("foo\n\n\nbar")
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reverse_adoc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-11-
|
11
|
+
date: 2019-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: mimemagic
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rspec
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -195,6 +209,8 @@ files:
|
|
195
209
|
- spec/assets/paragraphs.html
|
196
210
|
- spec/assets/quotation.html
|
197
211
|
- spec/assets/tables.html
|
212
|
+
- spec/assets/test.docx
|
213
|
+
- spec/assets/test.html
|
198
214
|
- spec/assets/unknown_tags.html
|
199
215
|
- spec/components/anchors_spec.rb
|
200
216
|
- spec/components/basic_spec.rb
|
@@ -265,6 +281,8 @@ test_files:
|
|
265
281
|
- spec/assets/paragraphs.html
|
266
282
|
- spec/assets/quotation.html
|
267
283
|
- spec/assets/tables.html
|
284
|
+
- spec/assets/test.docx
|
285
|
+
- spec/assets/test.html
|
268
286
|
- spec/assets/unknown_tags.html
|
269
287
|
- spec/components/anchors_spec.rb
|
270
288
|
- spec/components/basic_spec.rb
|