isodoc 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,99 @@
1
+ require "htmlentities"
2
+
3
module IsoDoc
  # Metadata part of the converter: reads identifying fields out of the
  # source XML and records them in the @meta hash, from which they are
  # later read back through #get_metadata.
  class Convert
    # Returns the metadata hash held in @meta (the hash itself, not a copy).
    def get_metadata
      @meta
    end

    # Stores +value+ under +key+ in the metadata hash and returns +value+.
    # @meta must already hold a hash; it is not initialised here.
    def set_metadata(key, value)
      @meta[key] = value
    end

    # Records the committee numbers (:tc, :sc, :wg) and the :secretariat.
    # Any of the four that is not found in +isoxml+ is recorded as the
    # placeholder "XXXX". +_out+ is unused.
    def author(isoxml, _out)
      {
        tc: "//technical-committee/@number",
        sc: "//subcommittee/@number",
        wg: "//workgroup/@number",
        secretariat: "//secretariat",
      }.each do |key, xpath|
        node = isoxml.at(ns(xpath))
        set_metadata(key, node ? node.text : "XXXX")
      end
    end

    # Records :docnumber (project number, plus "-<part>" when a part
    # attribute exists) and, when a status stage is present, :stage and
    # :stageabbr. For stages numerically below 60 the stage abbreviation is
    # prefixed to the document number. +_out+ is unused.
    def id(isoxml, _out)
      docnumber = isoxml.at(ns("//project-number"))
      partnumber = isoxml.at(ns("//project-number/@part"))
      documentstatus = isoxml.at(ns("//status/stage"))
      # A missing project number yields an empty number, not NoMethodError.
      dn = docnumber&.text.to_s
      dn += "-#{partnumber.text}" if partnumber
      if documentstatus
        stage = documentstatus.text
        abbr = stage_abbreviation(stage)
        set_metadata(:stage, stage)
        set_metadata(:stageabbr, abbr)
        # Only prefix when an abbreviation was actually returned.
        dn = "#{abbr} #{dn}" if abbr && stage.to_i < 60
      end
      set_metadata(:docnumber, dn)
    end

    # Builds the draft annotation: " (draft D)" or " (draft D, DATE)" when
    # +draft+ is given, and "" otherwise. Both arguments respond to #text
    # or are nil.
    def draftinfo(draft, revdate)
      return "" unless draft

      info = " (draft #{draft.text}"
      info += ", #{revdate.text}" if revdate
      "#{info})"
    end

    # Records :docyear, :draft, :revdate (each nil when its element is
    # absent) and the :draftinfo annotation built by #draftinfo.
    # +_out+ is unused.
    def version(isoxml, _out)
      yr = isoxml.at(ns("//copyright/from"))
      set_metadata(:docyear, yr&.text)
      draft = isoxml.at(ns("//version/draft"))
      set_metadata(:draft, draft&.text)
      revdate = isoxml.at(ns("//version/revision-date"))
      set_metadata(:revdate, revdate&.text)
      set_metadata(:draftinfo, draftinfo(draft, revdate))
    end

    # Joins the title components into one HTML string of the shape
    # "intro&nbsp;&mdash; main&nbsp;&mdash; Part&nbsp;N: part". The text of
    # each component is entity-encoded (hexadecimal); +intro+ and +part+ are
    # optional, and a missing +main+ contributes an empty string.
    # +partnumber+ is interpolated as-is.
    def compose_title(main, intro, part, partnumber)
      c = HTMLEntities.new
      title = main ? c.encode(main.text, :hexadecimal) : ""
      if intro
        title = "#{c.encode(intro.text, :hexadecimal)}&nbsp;&mdash; #{title}"
      end
      if part
        title = "#{title}&nbsp;&mdash; Part&nbsp;#{partnumber}: " \
                "#{c.encode(part.text, :hexadecimal)}"
      end
      title
    end

    # Records the composed English title as :doctitle. +_out+ is unused.
    def title(isoxml, _out)
      set_metadata(:doctitle, localized_title(isoxml, "en"))
    end

    # Records the composed French title as :docsubtitle. +_out+ is unused.
    def subtitle(isoxml, _out)
      set_metadata(:docsubtitle, localized_title(isoxml, "fr"))
    end

    private

    # Looks up the title components for language +lang+ and composes them.
    def localized_title(isoxml, lang)
      base = "//title[@language='#{lang}']"
      intro = isoxml.at(ns("#{base}/title-intro"))
      main = isoxml.at(ns("#{base}/title-main"))
      part = isoxml.at(ns("#{base}/title-part"))
      partnumber = isoxml.at(ns("//id/project-number/@part"))
      compose_title(main, intro, part, partnumber)
    end
  end
end
@@ -0,0 +1,156 @@
1
+ require "html2doc"
2
+ require "htmlentities"
3
+ require "nokogiri"
4
+ require "pp"
5
+
6
+ module IsoDoc
7
+ class Convert
8
+
9
+ def postprocess(result, filename, dir)
10
+ generate_header(filename, dir)
11
+ result = from_xhtml(cleanup(to_xhtml(result)))
12
+ toWord(result, filename, dir)
13
+ toHTML(result, filename)
14
+ end
15
+
16
+ def toWord(result, filename, dir)
17
+ result = from_xhtml(wordCleanup(to_xhtml(result)))
18
+ result = populate_template(result)
19
+ Html2Doc.process(result, filename, @wordstylesheet, "header.html",
20
+ dir, ['`', '`'])
21
+ end
22
+
23
+ def wordCleanup(docxml)
24
+ wordPreface(docxml)
25
+ wordAnnexCleanup(docxml)
26
+ docxml
27
+ end
28
+
29
+ # force Annex h2 to be p.h2Annex, so it is not picked up by ToC
30
+ def wordAnnexCleanup(docxml)
31
+ d = docxml.xpath("//h2[ancestor::*[@class = 'Section3']]").each do |h2|
32
+ h2.name = "p"
33
+ h2["class"] = "h2Annex"
34
+ end
35
+ end
36
+
37
+ def wordPreface(docxml)
38
+ cover = to_xhtml_fragment(File.read(@wordcoverpage, encoding: "UTF-8"))
39
+ d = docxml.at('//div[@class="WordSection1"]')
40
+ d.children.first.add_previous_sibling cover.to_xml(encoding: 'US-ASCII')
41
+ intro = to_xhtml_fragment(
42
+ File.read(@wordintropage, encoding: "UTF-8").
43
+ sub(/WORDTOC/, makeWordToC(docxml)))
44
+ d = docxml.at('//div[@class="WordSection2"]')
45
+ d.children.first.add_previous_sibling intro.to_xml(encoding: 'US-ASCII')
46
+ end
47
+
48
+ def populate_template(docxml)
49
+ meta = get_metadata
50
+ docxml.
51
+ gsub(/DOCYEAR/, meta[:docyear]).
52
+ gsub(/DOCNUMBER/, meta[:docnumber]).
53
+ gsub(/TCNUM/, meta[:tc]).
54
+ gsub(/SCNUM/, meta[:sc]).
55
+ gsub(/WGNUM/, meta[:wg]).
56
+ gsub(/DOCTITLE/, meta[:doctitle]).
57
+ gsub(/DOCSUBTITLE/, meta[:docsubtitle]).
58
+ gsub(/SECRETARIAT/, meta[:secretariat]).
59
+ gsub(/[ ]?DRAFTINFO/, meta[:draftinfo]).
60
+ gsub(/\[TERMREF\]\s*/, "[SOURCE: ").
61
+ gsub(/\s*\[\/TERMREF\]\s*/, "]").
62
+ gsub(/\s*\[ISOSECTION\]/, ", ").
63
+ gsub(/\s*\[MODIFICATION\]/, ", modified &mdash; ").
64
+ gsub(%r{WD/CD/DIS/FDIS}, meta[:stageabbr])
65
+ end
66
+
67
+ def generate_header(filename, dir)
68
+ header = File.read(@header, encoding: "UTF-8").
69
+ gsub(/FILENAME/, filename).
70
+ gsub(/DOCYEAR/, get_metadata()[:docyear]).
71
+ gsub(/[ ]?DRAFTINFO/, get_metadata()[:draftinfo]).
72
+ gsub(/DOCNUMBER/, get_metadata()[:docnumber])
73
+ File.open("header.html", "w") do |f|
74
+ f.write(header)
75
+ end
76
+ end
77
+
78
+ # these are in fact preprocess,
79
+ # but they are extraneous to main HTML file
80
+ def html_header(html, docxml, filename, dir)
81
+ anchor_names docxml
82
+ define_head html, filename, dir
83
+ end
84
+
85
+ # isodoc.css overrides any CSS injected by Html2Doc, which
86
+ # is inserted before this CSS.
87
+ def define_head(html, filename, dir)
88
+ html.head do |head|
89
+ head.title { |t| t << filename }
90
+ head.style do |style|
91
+ stylesheet = File.read(@standardstylesheet).
92
+ gsub("FILENAME", filename)
93
+ style.comment "\n#{stylesheet}\n"
94
+ end
95
+ end
96
+ end
97
+
98
+ def titlepage(_docxml, div)
99
+ titlepage = File.read(@wordcoverpage, encoding: "UTF-8")
100
+ div.parent.add_child titlepage
101
+ end
102
+
103
+ def wordTocEntry(toclevel, heading)
104
+ bookmark = Random.rand(1000000000)
105
+ <<~TOC
106
+ <p class="MsoToc#{toclevel}"><span class="MsoHyperlink"><span
107
+ lang="EN-GB" style='mso-no-proof:yes'>
108
+ <a href="#_Toc#{bookmark}">#{heading}<span lang="EN-GB"
109
+ class="MsoTocTextSpan">
110
+ <span style='mso-tab-count:1 dotted'>. </span>
111
+ </span><span lang="EN-GB" class="MsoTocTextSpan">
112
+ <span style='mso-element:field-begin'></span></span>
113
+ <span lang="EN-GB"
114
+ class="MsoTocTextSpan"> PAGEREF _Toc#{bookmark} \\h </span>
115
+ <span lang="EN-GB" class="MsoTocTextSpan"><span
116
+ style='mso-element:field-separator'></span></span><span
117
+ lang="EN-GB" class="MsoTocTextSpan">1</span>
118
+ <span lang="EN-GB"
119
+ class="MsoTocTextSpan"></span><span
120
+ lang="EN-GB" class="MsoTocTextSpan"><span
121
+ style='mso-element:field-end'></span></span></a></span></span></p>
122
+
123
+ TOC
124
+ end
125
+
126
+ WORD_TOC_PREFACE = <<~TOC
127
+ <span lang="EN-GB"><span
128
+ style='mso-element:field-begin'></span><span
129
+ style='mso-spacerun:yes'>&#xA0;</span>TOC
130
+ \\o &quot;1-2&quot; \\h \\z \\u <span
131
+ style='mso-element:field-separator'></span></span>
132
+ TOC
133
+
134
+ WORD_TOC_SUFFIX = <<~TOC
135
+ <p class="MsoToc1"><span lang="EN-GB"><span
136
+ style='mso-element:field-end'></span></span><span
137
+ lang="EN-GB"><o:p>&nbsp;</o:p></span></p>
138
+ TOC
139
+
140
+ def header_strip(h)
141
+ h.to_s.gsub(%r{<br/>}, " ").
142
+ sub(/<h[12][^>]*>/, "").sub(%r{</h[12]>}, "")
143
+ end
144
+
145
+ def makeWordToC(docxml)
146
+ toc = ""
147
+ docxml.xpath("//h1 | //h2[not(ancestor::*[@class = 'Section3'])]").
148
+ each do |h|
149
+ toc += wordTocEntry(h.name == "h1" ? 1 : 2, header_strip(h))
150
+ end
151
+ toc.sub(/(<p class="MsoToc1">)/,
152
+ %{\\1#{WORD_TOC_PREFACE}}) + WORD_TOC_SUFFIX
153
+ end
154
+
155
+ end
156
+ end
@@ -0,0 +1,129 @@
1
module IsoDoc
  # References part of the converter: renders the normative references
  # clause and the bibliography, one paragraph per reference.
  class Convert
    # Returns the citation code of ISO bibitem +b+: "ISO <docidentifier>",
    # followed by ": <publishdate>" when a publication date is present.
    def iso_bibitem_ref_code(b)
      isocode = b.at(ns("./docidentifier"))
      isodate = b.at(ns("./publishdate"))
      reference = "ISO #{isocode.text}"
      reference += ": #{isodate.text}" if isodate
      reference
    end

    # If +b+ has a note containing "ISO DATE:", strips every "ISO DATE: "
    # prefix from the first such note, wraps the matched notes in <p> and
    # passes the first one to #footnote_parse with +ref+ as the output.
    def date_note_process(b, ref)
      date_note = b.xpath(ns("./note[text()][contains(.,'ISO DATE:')]"))
      return if date_note.empty?

      first_note = date_note.first
      first_note.content = first_note.content.gsub(/ISO DATE: /, "")
      date_note.wrap("<p></p>")
      footnote_parse(date_note.first, ref)
    end

    # Renders ISO bibitem +b+ as one paragraph of +list+. When +biblio+ is
    # truthy the paragraph gets class "Biblio", a leading "[ordinal]" plus
    # tab, and a comma before the italicised name.
    def iso_bibitem_entry(list, b, ordinal, biblio)
      attrs = { id: b["id"], class: biblio ? "Biblio" : nil }
      list.p(**attr_code(attrs)) do |ref|
        if biblio
          ref << "[#{ordinal}]"
          insert_tab(ref, 1)
        end
        ref << iso_bibitem_ref_code(b)
        date_note_process(b, ref)
        ref << ", " if biblio
        ref.i { |i| i << " #{b.at(ns('./name')).text}" }
      end
    end

    # Writes the label of a non-ISO reference to +r+. A label +t+ made up
    # solely of digits is shown as "[t]"; any other label is shown as
    # "[ordinal]" followed by the label text and a comma. Both are
    # followed by a tab.
    def ref_entry_code(r, ordinal, t)
      # \A..\z anchor the whole string; ^..$ would accept any multi-line
      # label with one all-digit line.
      if /\A\d+\z/.match?(t)
        r << "[#{t}]"
        insert_tab(r, 1)
      else
        r << "[#{ordinal}]"
        insert_tab(r, 1)
        r << "#{t},"
      end
    end

    # Renders reference +b+ (with <ref> and <p> children) as a "Biblio"
    # paragraph of +list+. +bibliography+ is accepted but not used.
    def ref_entry(list, b, ordinal, bibliography)
      ref = b.at(ns("./ref"))
      para = b.at(ns("./p"))
      list.p(**attr_code(id: ref["id"], class: "Biblio")) do |r|
        ref_entry_code(r, ordinal, ref.text.gsub(/[\[\]]/, ""))
        para.children.each { |n| parse(n, r) }
      end
    end

    # Renders non-ISO bibitem +b+ (with <docidentifier> and <formatted>
    # children) as a "Biblio" paragraph of +list+. +bibliography+ is
    # accepted but not used.
    def noniso_bibitem(list, b, ordinal, bibliography)
      ref = b.at(ns("./docidentifier"))
      para = b.at(ns("./formatted"))
      list.p(**attr_code(id: b["id"], class: "Biblio")) do |r|
        ref_entry_code(r, ordinal, ref.text.gsub(/[\[\]]/, ""))
        para.children.each { |n| parse(n, r) }
      end
    end

    # Splits the bibitem children of +f+ into those published by ISO and
    # the rest, keeping document order within each group.
    # Returns { iso: [...], noniso: [...] }.
    def split_bibitems(f)
      iso, noniso = f.xpath(ns("./bibitem")).partition do |item|
        item.at(ns("./publisher/affiliation[name = 'ISO']"))
      end
      { iso: iso, noniso: noniso }
    end

    # Renders all bibitems of +f+ into +div+: ISO items first, then the
    # others, numbered consecutively from 1 across both groups.
    def biblio_list(f, div, bibliography)
      bibitems = split_bibitems(f)
      bibitems[:iso].each.with_index(1) do |b, ordinal|
        iso_bibitem_entry(div, b, ordinal, bibliography)
      end
      first_noniso = bibitems[:iso].size + 1
      bibitems[:noniso].each.with_index(first_noniso) do |b, ordinal|
        noniso_bibitem(div, b, ordinal, bibliography)
      end
    end

    # Introductory text used when the clause lists at least one reference.
    NORM_WITH_REFS_PREF = <<~BOILERPLATE.freeze
    The following documents are referred to in the text in such a way
    that some or all of their content constitutes requirements of this
    document. For dated references, only the edition cited applies.
    For undated references, the latest edition of the referenced
    document (including any amendments) applies.
    BOILERPLATE

    # Introductory text used when the clause lists no references.
    NORM_EMPTY_PREF =
      "There are no normative references in this document.".freeze

    # Adds the introductory paragraph of the normative references clause to
    # +div+, chosen by whether +f+ has any reference or bibitem child.
    def norm_ref_preface(f, div)
      no_refs = f.elements.none? { |e| %w[reference bibitem].include?(e.name) }
      div.p(no_refs ? NORM_EMPTY_PREF : NORM_WITH_REFS_PREF)
    end

    # Renders clause "2. Normative References" into +out+; does nothing
    # when the document has no such references section.
    def norm_ref(isoxml, out)
      q = "//sections/references[title = 'Normative References']"
      f = isoxml.at(ns(q))
      return unless f

      out.div do |div|
        clause_name("2.", "Normative References", div, false)
        norm_ref_preface(f, div)
        biblio_list(f, div, false)
      end
    end

    # Renders the Bibliography into +out+ after a page break: heading,
    # any children other than reference/title/bibitem, then the reference
    # list. Does nothing when the document has no bibliography section.
    def bibliography(isoxml, out)
      q = "//sections/references[title = 'Bibliography']"
      f = isoxml.at(ns(q))
      return unless f

      page_break(out)
      out.div do |div|
        div.h1 "Bibliography", class: "Section3"
        f.elements.each do |e|
          parse(e, div) unless %w[reference title bibitem].include?(e.name)
        end
        biblio_list(f, div, true)
      end
    end
  end
end
@@ -0,0 +1,136 @@
1
module IsoDoc
  # Sections part of the converter: renders clauses, annexes and the fixed
  # front/body sections (foreword, introduction, scope, terms, symbols).
  class Convert
    # Renders a nested clause +node+ as a div of +out+. Its title becomes
    # either a bold inline span (when the inline-header attribute is set)
    # or a heading whose level comes from the anchor table; every other
    # child is handed to #parse.
    # NOTE(review): any non-nil inline-header value counts as set,
    # including the string "false" - confirm what the input can contain.
    def clause_parse(node, out)
      out.div(**attr_code(id: node["id"])) do |div|
        node.children.each do |child|
          unless child.name == "title"
            parse(child, div)
            next
          end
          anchors = get_anchors
          anchor = anchors[node["id"]]
          heading = "#{anchor[:label]}. #{child.text}"
          if node["inline-header"]
            # Attached to the clause's own div, as #clause_name does.
            div.span(class: "zzMoveToFollowing") do |span|
              span.b { |b| b << "#{heading} " }
            end
          else
            div.send("h#{anchor[:level]}") { |h| h << heading }
          end
        end
      end
    end

    # Writes a clause heading made of +num+ and +title+ into +div+: as a
    # bold "zzMoveToFollowing" span when +inline_header+ is truthy,
    # otherwise as an h1 with a tab between number and title.
    def clause_name(num, title, div, inline_header)
      if inline_header
        div.span(class: "zzMoveToFollowing") do |s|
          s.b do |b|
            b << num
            b << "#{title} "
          end
        end
      else
        div.h1 do |h1|
          h1 << num
          insert_tab(h1, 1)
          h1 << title
        end
      end
    end

    # Renders every top-level clause of +isoxml+ except the one titled
    # "Scope" (rendered by #scope). Clauses without a title are rendered
    # without a heading.
    def clause(isoxml, out)
      isoxml.xpath(ns("//clause[parent::sections]")).each do |c|
        next if c.at(ns("./title"))&.text == "Scope"

        out.div(**attr_code(id: c["id"])) do |s|
          c.elements.each do |c1|
            if c1.name == "title"
              anchors = get_anchors
              clause_name("#{anchors[c['id']][:label]}.",
                          c1.text, s, c["inline-header"])
            else
              parse(c1, s)
            end
          end
        end
      end
    end

    # Writes the annex heading into +div+: the annex label from the anchor
    # table, two line breaks, then the title text of +name+ in bold.
    def annex_name(annex, name, div)
      div.h1(class: "Annex") do |t|
        anchors = get_anchors
        t << "#{anchors[annex['id']][:label]}<br/><br/>"
        t << "<b>#{name.text}</b>"
      end
    end

    # Renders every annex of +isoxml+ as a "Section3" div, each preceded
    # by a page break.
    def annex(isoxml, out)
      isoxml.xpath(ns("//annex")).each do |c|
        page_break(out)
        out.div(**attr_code(id: c["id"], class: "Section3")) do |s|
          c.elements.each do |c1|
            if c1.name == "title"
              annex_name(c, c1, s)
            else
              parse(c1, s)
            end
          end
        end
      end
    end

    # Renders clause "1. Scope"; does nothing when the document has none.
    def scope(isoxml, out)
      section = isoxml.at(ns("//clause[title = 'Scope']"))
      fixed_title_clause(section, out, "1.", "Scope") if section
    end

    # Renders clause "3. Terms and Definitions"; does nothing when the
    # document has no terms element.
    def terms_defs(isoxml, out)
      section = isoxml.at(ns("//terms"))
      fixed_title_clause(section, out, "3.", "Terms and Definitions") if section
    end

    # Renders clause "4. Symbols and Abbreviations"; does nothing when the
    # document has no symbols-abbrevs element.
    def symbols_abbrevs(isoxml, out)
      section = isoxml.at(ns("//symbols-abbrevs"))
      return unless section

      fixed_title_clause(section, out, "4.", "Symbols and Abbreviations")
    end

    # Renders the Introduction as a "Section3" div after a page break.
    # Children of a patent-notice element are rendered directly into the
    # div; the source title element is skipped. Does nothing when the
    # document has no introduction.
    def introduction(isoxml, out)
      f = isoxml.at(ns("//content[title = 'Introduction']")) || return
      page_break(out)
      out.div(class: "Section3") do |div|
        div.h1 "Introduction", **attr_code(class: "IntroTitle")
        f.elements.each do |e|
          if e.name == "patent-notice"
            e.elements.each { |e1| parse(e1, div) }
          elsif e.name != "title"
            parse(e, div)
          end
        end
      end
    end

    # Renders the Foreword after a page break; the source title element is
    # skipped. Does nothing when the document has no foreword.
    def foreword(isoxml, out)
      f = isoxml.at(ns("//content[title = 'Foreword']")) || return
      page_break(out)
      out.div do |s|
        s.h1(class: "ForewordTitle") { |h1| h1 << "Foreword" }
        f.elements.each { |e| parse(e, s) unless e.name == "title" }
      end
    end

    private

    # Renders +section+ as a div with the fixed heading +num+ +title+,
    # followed by every child of +section+ except its own title element.
    def fixed_title_clause(section, out, num, title)
      out.div do |div|
        clause_name(num, title, div, false)
        section.elements.each do |e|
          parse(e, div) unless e.name == "title"
        end
      end
    end
  end
end