bergamasco 0.2.14 → 0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/bergamasco.rb +1 -0
- data/lib/bergamasco/jats.csl +175 -0
- data/lib/bergamasco/jats.lua +662 -0
- data/lib/bergamasco/jats.rb +32 -0
- data/lib/bergamasco/version.rb +1 -1
- data/spec/fixtures/cool-dois.html.md +7 -7
- data/spec/fixtures/cool-dois.xml +98 -0
- data/spec/jats_spec.rb +24 -0
- data/templates/default.jats +82 -0
- metadata +8 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: adf586839173e90769dcbdeff4e32c4c0b4e4d5b
|
4
|
+
data.tar.gz: 73ea901886feefc53424da597aab4d3a9c32578b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 499eeac743a1c687aacafa8d481cb32963371a2684d655aaf58a312f7b8fb97f3b17f39e5d1c5bbbc30af284a728923d2d8d494c600a892270121920654dba35
|
7
|
+
data.tar.gz: d84baf3e47763d3a38d60cf37004c1084b0a311b1055447da8b4993f373e8765cea4f76f50614789b5e5d6f1e83b11b56820c60e50850a84b79444b0c78e7053
|
data/Gemfile.lock
CHANGED
data/lib/bergamasco.rb
CHANGED
@@ -0,0 +1,175 @@
|
|
1
|
+
<?xml version="1.0" encoding="utf-8"?>
<!--
  CSL style that makes citeproc emit JATS <ref>/<element-citation> markup.
  The JATS tags are produced through prefix/suffix attributes, so every
  markup character inside an attribute value or term has to be written as
  an XML entity (&lt; &gt; &quot;); otherwise this file is not well-formed.
-->
<style xmlns="http://purl.org/net/xbiblio/csl" class="in-text" version="1.0" default-locale="en-US">
  <!-- This style was edited with the Visual CSL Editor (http://editor.citationstyles.org/visualEditor/) -->
  <info>
    <title>Journal Article Tag Suite</title>
    <title-short>JATS</title-short>
    <id>http://www.zotero.org/styles/journal-article-tag-suite</id>
    <link href="http://www.zotero.org/styles/journal-article-tag-suite" rel="self"/>
    <link rel="documentation" href="http://jats.nlm.nih.gov/archiving/tag-library/1.0/index.html"/>
    <author>
      <name>Martin Fenner</name>
      <email>mfenner@plos.org</email>
    </author>
    <category citation-format="numeric"/>
    <category field="medicine"/>
    <category field="biology"/>
    <summary>Use this style to generate bibliographic data in Journal Article Tagging Suite (JATS) 1.0 XML format</summary>
    <updated>2015-04-26T17:02:43+00:00</updated>
    <rights license="http://creativecommons.org/licenses/by-sa/3.0/">This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License</rights>
  </info>
  <locale xml:lang="en">
    <terms>
      <term name="et-al">&lt;etal/&gt;</term>
    </terms>
  </locale>
  <!-- Completes the opening <ref tag with its id attribute, then emits <label>. -->
  <macro name="citation-label">
    <text variable="citation-number" prefix="id=&quot;" suffix="&quot;&gt;"/>
    <text variable="citation-number" prefix="&lt;label&gt;" suffix="&lt;/label&gt;"/>
  </macro>
  <!-- One <name> per author, falling back to editors when there are no authors. -->
  <macro name="author">
    <names variable="author">
      <name delimiter="&lt;/name&gt;&lt;name&gt;" prefix="&lt;name&gt;" suffix="&lt;/name&gt;" name-as-sort-order="all" sort-separator="">
        <name-part name="family" text-case="capitalize-first" prefix="&lt;surname&gt;" suffix="&lt;/surname&gt;"/>
        <name-part name="given" text-case="capitalize-first" prefix="&lt;given-names&gt;" suffix="&lt;/given-names&gt;"/>
      </name>
      <substitute>
        <names variable="editor"/>
      </substitute>
    </names>
  </macro>
  <macro name="editor">
    <group delimiter=": ">
      <names variable="editor">
        <name delimiter="" prefix="&lt;name&gt;" suffix="&lt;/name&gt;" name-as-sort-order="all" sort-separator="">
          <name-part name="family" text-case="capitalize-first" prefix="&lt;surname&gt;" suffix="&lt;/surname&gt;"/>
          <!-- suffix must be the closing tag; it used to repeat the opening tag -->
          <name-part name="given" text-case="capitalize-first" prefix="&lt;given-names&gt;" suffix="&lt;/given-names&gt;"/>
        </name>
      </names>
    </group>
  </macro>
  <macro name="title">
    <text variable="title"/>
  </macro>
  <macro name="container-title">
    <text variable="container-title" form="short" prefix="&lt;source&gt;" suffix="&lt;/source&gt;"/>
  </macro>
  <macro name="publisher">
    <text variable="publisher" prefix="&lt;publisher-name&gt;" suffix="&lt;/publisher-name&gt;"/>
    <text variable="publisher-place" prefix="&lt;publisher-loc&gt;" suffix="&lt;/publisher-loc&gt;"/>
  </macro>
  <!-- Emits DOI, PMID and URL (each only when present) as bare text. -->
  <macro name="link">
    <choose>
      <if match="any" variable="DOI">
        <text variable="DOI" />
      </if>
    </choose>
    <choose>
      <if match="any" variable="PMID">
        <text variable="PMID" />
      </if>
    </choose>
    <choose>
      <if variable="URL" match="any">
        <text variable="URL" />
      </if>
    </choose>
  </macro>
  <!--
    Periodicals, reports and patents get <date iso-8601-date="..."> built from
    the issued date; everything else gets <date-in-citation> built from the
    accessed date. The first <date> child writes the attribute and closes the
    opening tag, the second writes the <day>/<month>/<year> children.
  -->
  <macro name="date">
    <choose>
      <if type="article-journal article-magazine article-newspaper report patent" match="any">
        <group prefix="&lt;date" suffix="&lt;/date&gt;">
          <date variable="issued" prefix=" iso-8601-date=&quot;" suffix="&quot;&gt;">
            <date-part name="year" range-delimiter=""/>
            <date-part name="month" form="numeric-leading-zeros" range-delimiter="" prefix="-"/>
            <date-part name="day" form="numeric-leading-zeros" range-delimiter="" prefix="-"/>
          </date>
          <date variable="issued">
            <date-part name="day" form="numeric-leading-zeros" prefix="&lt;day&gt;" suffix="&lt;/day&gt;"/>
            <date-part name="month" form="numeric-leading-zeros" prefix="&lt;month&gt;" suffix="&lt;/month&gt;"/>
            <date-part name="year" prefix="&lt;year&gt;" suffix="&lt;/year&gt;"/>
          </date>
        </group>
      </if>
      <else>
        <group prefix="&lt;date-in-citation content-type=&quot;access-date&quot;" suffix="&lt;/date-in-citation&gt;">
          <date variable="accessed" prefix=" iso-8601-date=&quot;" suffix="&quot;&gt;">
            <date-part name="year"/>
            <date-part name="month" form="numeric-leading-zeros" prefix="-"/>
            <date-part name="day" form="numeric-leading-zeros" prefix="-"/>
          </date>
          <date variable="accessed">
            <date-part name="day" prefix="&lt;day&gt;" suffix="&lt;/day&gt;"/>
            <date-part name="month" form="numeric-leading-zeros" prefix="&lt;month&gt;" suffix="&lt;/month&gt;"/>
            <date-part name="year" prefix="&lt;year&gt;" suffix="&lt;/year&gt;"/>
          </date>
        </group>
      </else>
    </choose>
  </macro>
  <macro name="location">
    <choose>
      <if type="article-journal article-magazine" match="any">
        <text variable="volume" prefix="&lt;volume&gt;" suffix="&lt;/volume&gt;"/>
        <text variable="issue" prefix="&lt;issue&gt;" suffix="&lt;/issue&gt;"/>
      </if>
    </choose>
    <choose>
      <if type="article-journal article-magazine article-newspaper" match="any">
        <text variable="page-first" prefix="&lt;fpage&gt;" suffix="&lt;/fpage&gt;"/>
      </if>
    </choose>
  </macro>
  <!-- Writes the publication-type attribute and closes the <element-citation tag. -->
  <macro name="publication-type">
    <group prefix=" publication-type=&quot;" suffix="&quot;&gt;">
      <choose>
        <if type="article-journal article-magazine article-newspaper" match="any">
          <text value="journal"/>
        </if>
        <else-if type="book" match="any">
          <text value="book"/>
        </else-if>
        <else-if type="dataset" match="any">
          <text value="dataset"/>
        </else-if>
        <else-if type="patent" match="any">
          <text value="patent"/>
        </else-if>
        <else-if type="report" match="any">
          <text value="report"/>
        </else-if>
        <else-if type="review" match="any">
          <text value="review"/>
        </else-if>
        <else>
          <text value="standard"/>
        </else>
      </choose>
    </group>
  </macro>
  <citation collapse="citation-number">
    <sort>
      <key variable="citation-number"/>
    </sort>
    <layout delimiter=",">
      <text variable="citation-number"/>
    </layout>
  </citation>
  <bibliography sort-separator="">
    <layout>
      <group prefix="&lt;ref " suffix="&lt;/ref&gt;">
        <text macro="citation-label" suffix=" "/>
        <group prefix="&lt;element-citation" suffix="&lt;/element-citation&gt; ">
          <text macro="publication-type"/>
          <text macro="author" prefix="&lt;person-group person-group-type=&quot;author&quot;&gt;" suffix="&lt;/person-group&gt;"/>
          <text macro="title" prefix="&lt;article-title&gt;" suffix="&lt;/article-title&gt;"/>
          <text macro="container-title"/>
          <text macro="publisher"/>
          <text macro="date"/>
          <text macro="location"/>
          <text macro="link"/>
        </group>
      </group>
    </layout>
  </bibliography>
</style>
|
@@ -0,0 +1,662 @@
|
|
1
|
+
-- This is a JATS custom writer for pandoc. It produces output
|
2
|
+
-- that tries to conform to the JATS 1.0 specification
|
3
|
+
-- http://jats.nlm.nih.gov/archiving/tag-library/1.0/index.html
|
4
|
+
--
|
5
|
+
-- Invoke with: pandoc -t jats.lua
|
6
|
+
--
|
7
|
+
-- Note: you need not have lua installed on your system to use this
|
8
|
+
-- custom writer. However, if you do have lua installed, you can
|
9
|
+
-- use it to test changes to the script. 'lua jats.lua' will
|
10
|
+
-- produce informative error messages if your code contains
|
11
|
+
-- syntax errors.
|
12
|
+
--
|
13
|
+
-- Released under the GPL, version 2 or greater. See LICENSE for more info.
|
14
|
+
|
15
|
+
-- Module-level state shared by the writer callbacks in this file:
-- document metadata, numbered section headers, rendered body sections,
-- rendered back-matter sections, collected references and figures.
local meta, headers, sections, back, references, figures =
  {}, {}, {}, {}, {}, {}
|
22
|
+
|
23
|
+
-- Entry point called once for the whole document.
--   body      string: the concatenated output of all block callbacks
--   metadata  table:  document metadata, stored in `meta`
--   variables table:  not used here
-- Returns a fragment: a <body> element, followed by a <back> element when
-- any back-matter section was collected. Pass `--template=...` to pandoc
-- to have the fragment wrapped in a full document.
function Doc(body, metadata, variables)
  meta = metadata or {}

  -- Header() emits '</sec>' in front of every section it opens, so a body
  -- that does not begin with that marker has content before its first
  -- heading; give that content an untitled top-level section.
  local closing = '</sec>'
  if body:sub(1, #closing) ~= closing then
    body = Header(1, '') .. '\n' .. body
  end

  -- Move the leading closing tag to the end so every section is balanced.
  body = body:sub(#closing + 1) .. closing

  -- Hand each section to section_helper, which files it under `sections`
  -- or `back`.
  -- NOTE(review): `attr` is not declared local, so this writes a global;
  -- any code reading a bare `attr` will see it. Verify before localizing.
  local section_pattern = '<sec.-lev="(.-)".->%s<title>(.-)</title>(.-)</sec>'
  for lev, title, content in body:gmatch(section_pattern) do
    attr = section_helper(tonumber(lev), content, title)
  end

  local parts = { xml('body', '\n' .. table.concat(sections, '\n') .. '\n') }
  if #back > 0 then
    parts[#parts + 1] = xml('back', '\n' .. table.concat(back, '\n'))
  end

  return table.concat(parts, '\n')
end
|
53
|
+
|
54
|
+
-- XML character entity escaping and unescaping

-- Replace the XML-reserved characters < > & " ' in `s` with entity
-- references so the text can be used as element content or inside a
-- double-quoted attribute value.
-- Returns the escaped string followed by the number of replacements
-- (the two results of string.gsub).
function escape(s)
  local map = { ['<']  = '&lt;',
                ['>']  = '&gt;',
                ['&']  = '&amp;',
                ['"']  = '&quot;',
                ['\''] = '&#39;' }
  return s:gsub("[<>&\"']", function(x) return map[x] end)
end
|
63
|
+
|
64
|
+
-- Inverse of escape(): turn the entity references for < > & " ' back into
-- the characters they stand for. Entities that are not in the map (for
-- example &nbsp;) are left untouched because the callback returns nil.
-- Returns the unescaped string followed by the number of entity-shaped
-- matches (the two results of string.gsub).
function unescape(s)
  local map = { ['&lt;']   = '<',
                ['&gt;']   = '>',
                ['&amp;']  = '&',
                ['&quot;'] = '"',
                ['&#39;']  = '\'',
                ['&apos;'] = '\'' }
  -- the first capture is the whole entity, which is the map key
  return s:gsub('(&(#?)([%d%a]+);)', function(x) return map[x] end)
end
|
72
|
+
|
73
|
+
-- Serialize an attribute table as ' name="value"' pairs ready to be placed
-- inside an opening tag. Pairs are emitted in sorted key order, values are
-- passed through escape(), and nil/false/empty values are skipped.
-- Returns '' when nothing is emitted.
function attributes(attr)
  local out = ''
  for name, value in pairsByKeys(attr) do
    if value and value ~= '' then
      out = out .. string.format(' %s="%s"', name, escape(value))
    end
  end
  return out
end
|
84
|
+
|
85
|
+
-- Iterate over `t` in sorted key order so that generated attributes come
-- out in a consistent order.
--   t  table to traverse
--   f  optional comparison function handed to table.sort
-- Returns an iterator yielding key, value.
function pairsByKeys (t, f)
  local keys = {}
  for key in pairs(t) do
    keys[#keys + 1] = key
  end
  table.sort(keys, f)

  local pos = 0
  return function ()
    pos = pos + 1
    local key = keys[pos]
    if key == nil then
      return nil
    end
    return key, t[key]
  end
end
|
99
|
+
|
100
|
+
-- Generic XML element builder.
--   tag   element name
--   s     element content; when nil or false a self-closing tag is produced
--   attr  optional attribute table, serialized with attributes()
function xml(tag, s, attr)
  local attrs = ''
  if attr then
    attrs = attributes(attr)
  end

  if s then
    return '<' .. tag .. attrs .. '>' .. s .. '</' .. tag .. '>'
  end
  return '<' .. tag .. attrs .. '/>'
end
|
106
|
+
|
107
|
+
-- Flatten a nested table, as needed for nested YAML metadata.
-- Nested associative tables are collapsed into a single level whose keys
-- are the path segments joined with '-' (e.g. { a = { b = 1 } } becomes
-- { ['a-b'] = 1 }). A table in which an array index is encountered is
-- stored whole under the path that led to it; non-table values are copied
-- as they are.
function flatten_table(tbl)
  local flat = {}

  local function walk(node, prefix)
    for k, v in pairs(node) do
      local is_array_index = type(k) == 'number' and k > 0 and k <= #node
      if is_array_index then
        -- keep arrays intact and stop descending into this node
        flat[prefix] = node
        break
      end

      local path = (prefix and prefix .. '-' or '') .. k
      if type(v) == 'table' then
        walk(v, path)
      else
        flat[path] = v
      end
    end
  end

  walk(tbl)
  return flat
end
|
134
|
+
|
135
|
+
-- Read a file and return its entire contents, or nil when it cannot be
-- opened. `name` is handed to io.open unchanged, so a relative name is
-- resolved against the working directory.
-- The handle is closed before returning; it used to be left open. The
-- earlier split into base and extension was dropped because the two parts
-- were concatenated straight back into the original name.
function read_file(name)
  local file = io.open(name, "r")
  if not file then return nil end

  local contents = file:read("*all")
  file:close()
  return contents
end
|
144
|
+
|
145
|
+
-- Parse a YAML string and return a table. Only a subset is understood:
--   key: value            scalar, optional surrounding quotes removed
--   key: [a, b, c]        inline array of strings
--   key:                  opens a nested table; following lines indented by
--     child: value        exactly two more spaces become its fields
-- Lines that do not look like `key: ...` are ignored.
-- All working variables are local; the patterns and the array accumulator
-- used to leak into the global environment.
function parse_yaml(s)
  local l = {}    -- the input split into lines
  local c = {}    -- result table
  local i = 0     -- indentation of the most recent parent key
  local k = nil   -- name of the most recent parent key

  -- patterns
  local line_pattern = '(.-)\r?\n'
  local config_pattern = '^(%s*)([%w%-]+):%s*(.-)$'

  -- First split string into lines
  local function lines(line)
    table.insert(l, line)
    return ""
  end

  -- gsub consumes every newline-terminated line; what is left over (the
  -- unterminated tail, possibly empty) is pushed by the outer call.
  lines((s:gsub(line_pattern, lines)))

  -- Then go over each line and check value and indentation
  for _, line in ipairs(l) do
    line:gsub(config_pattern, function(indent, tag, v)
      if (v == '') then
        -- a key without a value starts a nested table
        i, k = string.len(indent), tag
        c[tag] = {}
      else
        -- check whether value is enclosed by brackets, i.e. an array
        if v:find('^%[(.-)%]$') then
          local arr = {}
          for match in (v:sub(2, -2) .. ','):gmatch('(.-)' .. ',%s*') do
            table.insert(arr, match)
          end
          v = arr
        else
          -- if it is a string, remove optional enclosing quotes
          v = v:match('^["\']*(.-)["\']*$')
        end

        if string.len(indent) == i + 2 and k then
          c[k][tag] = v
        else
          c[tag] = v
        end
      end
    end)
  end

  return c
end
|
195
|
+
|
196
|
+
-- Look up the sec-type value for a section title.
-- Returns nil when the title is not one of the recognized headings.
function sec_type_helper(s)
  local sec_types = {
    ['Abstract']                  = 'abstract',
    ['Acknowledgments']           = 'acknowledgements',
    ['Author Summary']            = 'author-summary',
    ['Conclusions']               = 'conclusions',
    ['Discussion']                = 'discussion',
    ['Glossary']                  = 'glossary',
    ['Introduction']              = 'intro',
    ['Materials and Methods']     = 'materials|methods',
    ['Notes']                     = 'notes',
    ['References']                = 'references',
    ['Results']                   = 'results',
    ['Supporting Information']    = 'supplementary-material',
    ['Supplementary Information'] = 'supplementary-material',
  }

  local sec_type = sec_types[s]
  return sec_type
end
|
213
|
+
|
214
|
+
-- File one parsed section in the right place.
--   lev    header level (number)
--   s      section content
--   title  section title, used to look up the sec-type
-- Acknowledgements, references, notes and glossary are appended to `back`;
-- abstract and author summary are dropped; everything else is appended to
-- `sections`. Returns the attribute table holding the sec-type.
function section_helper(lev, s, title)
  local sec_type = sec_type_helper(title)
  local attr = { ['sec-type'] = sec_type }

  if sec_type == "abstract" or sec_type == "author-summary" then
    -- discard, should be provided via metadata
    return attr
  end

  if sec_type == "acknowledgements" then
    table.insert(back, Ack(s, title))
  elseif sec_type == "references" then
    table.insert(back, RefList(s, title))
  elseif sec_type == "notes" then
    table.insert(back, Note(s, title))
  elseif sec_type == "glossary" then
    table.insert(back, Glossary(s, title))
  elseif sec_type == "supplementary-material" then
    table.insert(sections, SupplementaryMaterial(s, title))
  else
    table.insert(sections, Section(lev, s, title, attr))
  end

  return attr
end
|
235
|
+
|
236
|
+
-- Create a table with year, month, day (plus the other os.date('*t')
-- fields) and an `iso8601` field holding the YYYY-MM-DD string.
-- Input is an ISO 8601 date as a string of exactly ten characters.
-- Returns nil if the input is not a valid date: wrong type or length, not
-- in YYYY-MM-DD form, or a date that does not exist (e.g. 2016-02-31,
-- which os.time would otherwise roll over into March).
-- All working variables are local; they used to leak as globals.
function date_helper(iso_date)
  if type(iso_date) ~= 'string' or #iso_date ~= 10 then return nil end

  local y, m, d = iso_date:match('^(%d%d%d%d)-(%d%d)-(%d%d)$')
  if not y then return nil end
  y, m, d = tonumber(y), tonumber(m), tonumber(d)

  -- os.time may return nil for dates it cannot represent
  local time = os.time({ year = y, month = m, day = d })
  if not time then return nil end

  -- a date that does not survive the round trip was normalized by os.time
  local date = os.date('*t', time)
  if date.year ~= y or date.month ~= m or date.day ~= d then return nil end

  date.iso8601 = string.format('%04d-%02d-%02d', date.year, date.month, date.day)
  return date
end
|
248
|
+
|
249
|
+
-- Create the affiliation table, linked to authors via aff-id.
-- Each distinct `affiliation` among tbl.author gets a number in order of
-- first appearance; that number is stored on the author as 'aff-id'.
-- tbl.aff is replaced by a list of { id = n, name = affiliation } in id
-- order (iterating the lookup table with pairs() gave no defined order).
-- A missing tbl.author is treated as an empty list. Returns tbl.
function affiliation_helper(tbl)
  local ids = {}    -- affiliation name -> id
  local affs = {}   -- ordered list of { id, name }

  for _, author in ipairs(tbl.author or {}) do
    local name = author.affiliation
    if name then
      if not ids[name] then
        local id = #affs + 1
        ids[name] = id
        affs[id] = { id = id, name = name }
      end
      author['aff-id'] = ids[name]
    end
  end

  tbl.aff = affs
  return tbl
end
|
272
|
+
|
273
|
+
-- Create the corresponding-author table, linked to authors via cor-id.
-- Every author with both `corresp` and `email` set gets the next number as
-- 'cor-id'. tbl.corresp is replaced by a list of { id = n, email = ... }
-- in id order. A missing tbl.author is treated as an empty list.
-- Working variables are local; they used to leak as globals. Returns tbl.
function corresp_helper(tbl)
  local list = {}

  for _, author in ipairs(tbl.author or {}) do
    if author.corresp and author.email then
      local id = #list + 1
      list[id] = { id = id, email = author.email }
      author['cor-id'] = id
    end
  end

  tbl.corresp = list
  return tbl
end
|
294
|
+
|
295
|
+
-- Temporary clean-up of the bibliography markup:
--   * removes the ', ' that follows a closing </surname>
--   * collapses '</name></name><name>' into a single '</name>'
-- Returns the cleaned string.
function fix_citeproc(s)
  local without_commas = s:gsub('</surname>, ', '</surname>')
  local repaired = without_commas:gsub('</name></name><name>', '</name>')
  return repaired
end
|
301
|
+
|
302
|
+
-- Translate a pandoc alignment name into a value for the `align`
-- attribute: 'AlignRight' gives 'right', 'AlignCenter' gives 'center',
-- and anything else (AlignLeft, AlignDefault, nil) gives 'left'.
function html_align(align)
  if align == 'AlignRight' then
    return 'right'
  elseif align == 'AlignCenter' then
    return 'center'
  end
  return 'left'
end

-- Blocksep is used to separate block elements.
function Blocksep()
  local separator = "\n"
  return separator
end
|
314
|
+
|
315
|
+
-- The functions that follow render corresponding pandoc elements.
|
316
|
+
-- s is always a string, attr is always a table of attributes, and
|
317
|
+
-- items is always an array of strings (the items in a list).
|
318
|
+
-- Comments indicate the types of other variables.
|
319
|
+
-- Defined at https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/Writers/Custom.hs
|
320
|
+
|
321
|
+
-- block elements
|
322
|
+
|
323
|
+
-- Plain text block: passed through without a wrapper element.
function Plain(s)
  return s
end

-- Paragraph: wrapped in <p>.
function Para(s)
  return xml('p', s)
end

-- Raw block: wrapped in <preformat>.
-- pandoc's custom-writer interface calls RawBlock(format, str), so with a
-- single parameter the format name (e.g. "html") was emitted instead of
-- the raw content. The second argument is now used when present; a
-- one-argument call keeps its previous behaviour.
-- NOTE(review): the content is inserted unescaped, as before; confirm that
-- is intended for raw markup.
function RawBlock(s, str)
  local content = str
  if content == nil then
    content = s
  end
  return xml('preformat', content)
end
|
334
|
+
|
335
|
+
-- Horizontal rule. JATS restricts its use to inside table cells
-- (<td> and <th>).
function HorizontalRule()
  local rule = '<hr/>'
  return rule
end

-- Header of level `lev` (an integer) with rendered title `s`.
-- The end of the section is not known at this point, so instead of a
-- complete element this emits the closing tag of the preceding section
-- followed by an opening <sec> that carries the level in a `lev`
-- attribute, then the <title>. The `lev` entry is written into `attr`
-- itself when a table is passed in.
function Header(lev, s, attr)
  attr = attr or {}
  attr['lev'] = '' .. lev

  local opening = '<sec' .. attributes(attr) .. '>'
  return '</sec>\n' .. opening .. '\n' .. xml('title', s)
end

-- Note content is passed through unchanged.
function Note(s)
  local content = s
  return content
end
|
351
|
+
|
352
|
+
-- Code block.
--   s     the code
--   attr  attribute table (may carry `class`); nil is treated as empty
-- The code is escaped and wrapped in <preformat>, the element Code() in
-- this file already uses; the previous <pre><code> output is HTML, not
-- JATS.
-- A block with class 'dot' is rendered through `dot` and `base64` only
-- when a global `pipe` function is available. No `pipe` is defined in this
-- file, so without that check the call resolved to the undefined-function
-- fallback and produced an empty data: URL.
-- NOTE(review): the dot branch still emits an HTML <img>; confirm which
-- JATS element should be used if that branch is kept.
function CodeBlock(s, attr)
  attr = attr or {}

  local is_dot = attr.class and string.match(' ' .. attr.class .. ' ', ' dot ')
  local run = rawget(_G, 'pipe')
  if is_dot and type(run) == 'function' then
    local png = run("base64", run("dot -Tpng", s))
    return '<img src="data:image/png;base64,' .. png .. '"/>'
  end

  -- otherwise treat as code (one could pipe through a highlighter)
  return xml('preformat', escape(s), attr)
end
|
364
|
+
|
365
|
+
-- Block quote: wrapped in <boxed-text>.
-- The element was built but never returned, so the function yielded nil
-- and the quoted content was lost.
function BlockQuote(s)
  return xml('boxed-text', s)
end
|
368
|
+
|
369
|
+
-- Render a table as <table-wrap><table>...</table></table-wrap>.
--   caption  string
--   aligns   array of pandoc alignment names, one per column
--   widths   array of floats (column width as a fraction of the total)
--   headers  array of strings, one per column
--   rows     array of arrays of strings
-- Fixes over the previous version:
--   * the caption element was built with the tag name 'caption>', which
--     produced '<caption>>' and '</caption>>'
--   * the <p> opened around the caption text was never closed
--   * column widths are floored before formatting, because %d rejects
--     non-integral floats on Lua 5.3+
--   * the unused global `head` is no longer assigned
function Table(caption, aligns, widths, headers, rows)
  local buffer = {}
  local function add(s)
    table.insert(buffer, s)
  end

  add('<table-wrap>')
  if caption ~= '' then
    -- if caption begins with <bold> text, make it the <title>
    caption = string.gsub('<p>' .. caption, "^<p><bold>(.-)</bold>%s", "<title>%1</title>\n<p>")
    add(xml('caption', caption .. '</p>'))
  end

  add("<table>")
  if widths and widths[1] ~= 0 then
    for _, w in ipairs(widths) do
      add('<col width="' .. string.format("%d%%", math.floor(w * 100)) .. '" />')
    end
  end

  local header_row = {}
  local empty_header = true
  for i, h in ipairs(headers) do
    local align = html_align(aligns[i])

    -- remove <p> tag
    h = h:gsub("^<p>(.-)</p>", "%1")

    table.insert(header_row, '<th align="' .. align .. '">' .. h .. '</th>')
    empty_header = empty_header and h == ""
  end

  -- a header row is only written when at least one header cell has content
  if not empty_header then
    add('<tr>')
    for _, h in ipairs(header_row) do
      add(h)
    end
    add('</tr>')
  end

  for _, row in ipairs(rows) do
    add('<tr>')
    for i, c in ipairs(row) do
      -- remove <p> tag
      c = c:gsub("^<p>(.-)</p>", "%1")
      add('<td align="' .. html_align(aligns[i]) .. '">' .. c .. '</td>')
    end
    add('</tr>')
  end

  add('</table>\n</table-wrap>')
  return table.concat(buffer, '\n')
end
|
421
|
+
|
422
|
+
-- Bullet list: a <list> with list-type "bullet".
function BulletList(items)
  return List(items, { ['list-type'] = 'bullet' })
end

-- Ordered list: a <list> with list-type "order".
function OrderedList(items)
  return List(items, { ['list-type'] = 'order' })
end

-- Shared list renderer: wraps every item in <list-item> and the whole in
-- <list> carrying the attributes in `attr`.
function List(items, attr)
  local rendered = {}
  for _, item in pairs(items) do
    rendered[#rendered + 1] = xml('list-item', item)
  end

  local inner = '\n' .. table.concat(rendered, '\n') .. '\n'
  return xml('list', inner, attr)
end
|
439
|
+
|
440
|
+
-- Definition list. `items` is a table of tables; each inner table maps a
-- term to an array of definitions. Every term becomes a <def-item> with
-- one <term> and one <def> per definition.
-- (Revisit association list StackValue instance.)
function DefinitionList(items)
  local entries = {}
  for _, item in pairs(items) do
    for term, definitions in pairs(item) do
      local joined = table.concat(definitions, '</def><def>')
      local entry = xml('term', term) .. xml('def', joined)
      entries[#entries + 1] = xml('def-item', entry)
    end
  end

  return xml('def-list', '\n' .. table.concat(entries, '\n') .. '\n')
end

-- Div content is passed through unchanged; `attr` is ignored.
function Div(s, attr)
  local content = s
  return content
end
|
457
|
+
|
458
|
+
-- custom block elements for JATS
|
459
|
+
|
460
|
+
-- Build a numbered <sec> element. Sections are generated after the headers
-- have been parsed, to allow reordering.
--   lev    header level
--   s      section content
--   title  section title ('' gives an empty <title/>)
--   attr   table whose 'sec-type' entry is copied onto the element
-- The id is 'sec-' followed by the dotted per-level counters, e.g. sec-2.1.
-- Fixes over the previous version:
--   * counters deeper than `lev` are dropped by copying only levels 1..lev;
--     the old loop ran up to #headers (not the counter count) and removed
--     ascending positions, which skips entries and is out of bounds for
--     table.remove on Lua 5.3+
--   * each header record gets its own counter table; the single shared
--     table was mutated in place, altering earlier records
--   * missing parent levels count as 0 so table.concat never meets a hole
function Section(lev, s, title, attr)
  attr = attr or {}

  local last = headers[#headers]
  local h = {}
  for i = 1, lev do
    h[i] = (last and last.h[i]) or 0
  end
  h[lev] = h[lev] + 1

  local header = { ['h'] = h,
                   ['title'] = title,
                   ['id'] = 'sec-' .. table.concat(h, '.'),
                   ['sec-type'] = attr['sec-type'] }

  table.insert(headers, header)

  attr = { ['id'] = header['id'], ['sec-type'] = header['sec-type'] }
  title = xml('title', title ~= '' and title or nil)
  return xml('sec', '\n' .. title .. s, attr)
end
|
480
|
+
|
481
|
+
-- Supplementary material: title and content go inside a <caption>, wrapped
-- in <supplementary-material>. The incoming `attr` is replaced by an empty
-- table, so no attributes are emitted.
function SupplementaryMaterial(s, title, attr)
  attr = {}
  local caption = xml('caption', xml('title', title) .. s)
  return xml('supplementary-material', '\n' .. caption .. '\n', attr)
end

-- Acknowledgements: <ack>, with a leading <title> when a title is given.
function Ack(s, title)
  local heading = ''
  if title then
    heading = '\n' .. xml('title', title)
  end
  return xml('ack', heading .. s)
end

-- Glossary: <glossary> holding a <title> followed by the content.
function Glossary(s, title, attr)
  local heading = xml('title', title)
  return xml('glossary', heading .. s, attr)
end
|
497
|
+
|
498
|
+
-- Build the <ref-list> for the back matter.
--   s      bibliography markup containing <ref id="N">...</ref> entries
--   title  heading of the reference section
-- Numeric ref ids are rewritten to the form r001, every <ref> is collected
-- through Ref(), and the collected references are wrapped in <ref-list>.
-- Returns '' when no reference has been collected.
-- The wrapper is created without attributes. It used to pass a bare `attr`,
-- which is not a parameter or local here, so whatever global `attr` held at
-- the time ended up on the element.
function RefList(s, title)
  s = fix_citeproc(s)

  -- format ids
  s = string.gsub(s, '<ref id="(%d+)">', function (r)
    local attr = { ['id'] = string.format('r%03d', tonumber(r)) }
    return '<ref ' .. attributes(attr) .. '>'
  end)

  for ref in string.gmatch(s, '(<ref.-</ref>)') do
    Ref(ref)
  end

  if #references > 0 then
    title = xml('title', title)
    return xml('ref-list', title .. table.concat(references, '\n'))
  else
    return ''
  end
end
|
518
|
+
|
519
|
+
-- Append one rendered reference to `references` and return the new count.
function Ref(s)
  references[#references + 1] = s
  return #references
end
|
523
|
+
|
524
|
+
-- inline elements
|
525
|
+
|
526
|
+
-- Text run, passed through unchanged.
function Str(s)
  local text = s
  return text
end

-- Inter-word space.
function Space()
  local blank = ' '
  return blank
end

-- Emphasis: <italic>.
function Emph(s)
  local markup = xml('italic', s)
  return markup
end

-- Strong emphasis: <bold>.
function Strong(s)
  local markup = xml('bold', s)
  return markup
end

-- Strikeout: <strike>.
function Strikeout(s)
  local markup = xml('strike', s)
  return markup
end

-- Superscript: <sup>.
function Superscript(s)
  local markup = xml('sup', s)
  return markup
end

-- Subscript: <sub>.
function Subscript(s)
  local markup = xml('sub', s)
  return markup
end

-- Small caps: <sc>.
function SmallCaps(s)
  local markup = xml('sc', s)
  return markup
end

-- Single-quoted text: surrounded by straight apostrophes.
function SingleQuoted(s)
  local mark = "'"
  return mark .. s .. mark
end

-- Double-quoted text: surrounded by straight double quotes.
function DoubleQuoted(s)
  local mark = '"'
  return mark .. s .. mark
end
|
565
|
+
|
566
|
+
-- Format an in-text citation.
-- Every run of digits below 1000 found in `s` becomes an
-- <xref ref-type="bibr" rid="rNNN">[N]</xref>; numbers of 1000 and above
-- are skipped as a workaround for years mistakenly taken for keys.
-- When no xref is produced the original text is returned, for backwards
-- compatibility.
function Cite(s)
  local xrefs = {}
  for digits in s:gmatch('(%d+)') do
    local id = tonumber(digits)
    if id and id < 1000 then
      local attr = { ['ref-type'] = 'bibr',
                     ['rid'] = string.format("r%03d", id) }
      xrefs[#xrefs + 1] = xml('xref', '[' .. id .. ']', attr)
    end
  end

  if #xrefs == 0 then
    return s
  end
  return table.concat(xrefs)
end
|
585
|
+
|
586
|
+
-- Inline code: <preformat> carrying the attributes in `attr`.
function Code(s, attr)
  local markup = xml('preformat', s, attr)
  return markup
end

-- Display math: <disp-formula>.
function DisplayMath(s)
  local markup = xml('disp-formula', s)
  return markup
end

-- Inline math: <inline-formula>.
function InlineMath(s)
  local markup = xml('inline-formula', s)
  return markup
end
|
597
|
+
|
598
|
+
-- Raw inline: wrapped in <preformat>.
-- pandoc's custom-writer interface calls RawInline(format, str), so with a
-- single parameter the format name was emitted instead of the raw content.
-- The second argument is now used when present; a one-argument call keeps
-- its previous behaviour.
-- NOTE(review): the content is inserted unescaped, as before; confirm that
-- is intended for raw markup.
function RawInline(s, str)
  local content = str
  if content == nil then
    content = s
  end
  return xml('preformat', content)
end

-- Hard line break: rendered as a single space.
function LineBreak()
  return ' '
end
|
605
|
+
|
606
|
+
-- Hyperlink.
--   s      link text
--   src    target URL
--   title  link title (nil or '' leaves the attribute out)
-- Returns an <ext-link> element, or the bare link text when either the
-- text or the target is empty.
-- Attribute values are handed over as they are: attributes() escapes every
-- value it serializes, so escaping them here as well turned '&' in a URL
-- into '&amp;amp;'. The attribute table is local; it used to overwrite a
-- global `attr`.
function Link(s, src, title)
  if src ~= '' and s ~= '' then
    local attr = { ['ext-link-type'] = 'uri',
                   ['xlink:href'] = src,
                   ['xlink:title'] = title,
                   ['xlink:type'] = 'simple' }

    return xml('ext-link', escape(s), attr)
  else
    return s
  end
end
|
618
|
+
|
619
|
+
-- Figure: a <fig> with id gNNN holding a <caption> built from `s` and a
-- <graphic> built by Image(nil, src, title). The figure is appended to
-- `figures` and returned.
-- NOTE(review): the gsub callback below builds a <title> element but does
-- not return it, so the substitution leaves `title` as it was; confirm
-- whether the leading <bold> text was meant to be turned into a <title>.
-- NOTE(review): verify the parameter order against the order in which
-- pandoc passes arguments to CaptionedImage.
function CaptionedImage(s, src, title)
  -- if title begins with <bold> text, make it the <title>
  title = string.gsub(title, "^<bold>(.-)</bold>%s", function(t) xml('title', t) end)

  local attr = { ['id'] = string.format("g%03d", #figures + 1) }
  local fig = xml('fig', xml('caption', s) .. Image(nil, src, title), attr)

  figures[#figures + 1] = fig
  return fig
end
|
630
|
+
|
631
|
+
-- Image: a <graphic> element.
--   s      element content; nil gives a self-closing <graphic/>
--   src    image location, written to xlink:href
--   title  image title, written to xlink:title (nil or '' leaves it out)
-- Attribute values are handed over as they are: attributes() escapes every
-- value it serializes, so escaping them here as well double-escaped them,
-- and a nil title raised an error inside escape().
function Image(s, src, title)
  local attr = { ['mimetype'] = 'image',
                 ['xlink:href'] = src,
                 ['xlink:title'] = title,
                 ['xlink:type'] = 'simple' }

  return xml('graphic', s, attr)
end
|
639
|
+
|
640
|
+
-- Span: map three exact inline style values onto the matching formatter
-- (bold, italic, small caps); any other span is passed through unchanged.
function Span(s, attr)
  local style = attr.style

  if style == "font-weight:bold" then
    return Strong(s)
  end
  if style == "font-style:italic" then
    return Emph(s)
  end
  if style == "font-variant: small-caps" then
    return SmallCaps(s)
  end
  return s
end
|
652
|
+
|
653
|
+
-- Fallback for globals that are not defined: looking one up writes a
-- warning to stderr and yields a function that returns an empty string.
-- This is useful while working on the writer, because a callback that has
-- not been written yet produces a warning instead of an error.
local fallback = {}

function fallback.__index(_, key)
  io.stderr:write(string.format("WARNING: Undefined function '%s'\n",key))
  return function() return "" end
end

setmetatable(_G, fallback)
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Bergamasco
  # Conversion of Markdown text to JATS XML through pandoc, using the
  # template, Lua custom writer and CSL style that ship with the gem.
  module Jats
    # Root directory of the gem; this file lives in lib/bergamasco.
    GEM_ROOT = File.expand_path("../..", __dir__)

    # Bundled assets are resolved from the gem itself rather than from the
    # current working directory, so rendering also works when the caller's
    # working directory is not the gem root.
    DEFAULT_TEMPLATE = File.join(GEM_ROOT, "templates", "default.jats")
    JATS_WRITER = File.join(GEM_ROOT, "lib", "bergamasco", "jats.lua")
    JATS_CSL = File.join(GEM_ROOT, "lib", "bergamasco", "jats.csl")

    # Options that are not forwarded to pandoc.
    NON_PANDOC_OPTIONS = [:skip_yaml_header,
                          :separator,
                          :sitepath,
                          :authorpath,
                          :referencespath,
                          :username,
                          :password,
                          :sandbox,
                          :prefix].freeze

    # Render +text+ as JATS XML.
    #
    # The :template, :to and :csl options are always set to the bundled
    # files; the remaining options, apart from NON_PANDOC_OPTIONS, are
    # passed on to PandocRuby.
    #
    # Returns the converted document, or nil (after printing a message)
    # when the pandoc executable cannot be found.
    def self.render_jats(text, options={})
      # A former `options.merge(metadata: options[:metadata])` step was
      # dropped: it merged a key with the value it already had.
      options = options.merge(template: DEFAULT_TEMPLATE,
                              to: JATS_WRITER,
                              csl: JATS_CSL)
      converter = PandocRuby.new(text, options.except(*NON_PANDOC_OPTIONS))
      converter.convert
    rescue Errno::ENOENT
      # if pandoc is not installed.
      puts "Pandoc is not installed"
    end

    # Render the file at +filepath+ and write the result next to it, with
    # the ".html.md" suffix replaced by ".xml".
    #
    # Returns the path of the XML file, or nil when rendering failed. In
    # that case nothing is written, so an existing XML file is not
    # replaced by an empty one.
    def self.write_jats(filepath, options={})
      file = IO.read(filepath)
      xml_path = File.join(File.dirname(filepath), File.basename(filepath, ".html.md")) + ".xml"
      xml = render_jats(file, options)
      return nil if xml.nil?

      IO.write(xml_path, xml)
      xml_path
    end
  end
end
|
data/lib/bergamasco/version.rb
CHANGED
@@ -15,9 +15,9 @@ Cool URIs are, of course, a fundamental principle behind DOIs, with the two impo
|
|
15
15
|
|
16
16
|
All DOIs, expressed as HTTP URI, are therefore cool URIs. So what is a cool DOI? And, furthermore, how to create and use them? To understand what a cool DOI is, we have to explain the three parts that make up a DOI:
|
17
17
|
|
18
|
-
![](
|
18
|
+
![](images/2016/12/doi-parts.png)
|
19
19
|
|
20
|
-
|
20
|
+
## Proxy
|
21
21
|
|
22
22
|
The proxy is not part of the DOI specification, but almost all scholarly DOIs that users encounter today will be expressed as HTTP URLs. DataCite recommends that all DOIs are displayed as permanent URLs, consistent with the recommendations of other DOI registration agencies, e.g. the [Crossref DOI display guidelines](http://www.crossref.org/02publishers/doi_display_guidelines.html). When the DOI system was originally designed, it was thought that the DOI protocol would become widely used, but that clearly has not happened and displaying DOIs as **doi:10.5281/ZENODO.31780** is therefore not recommended.
|
23
23
|
|
@@ -30,13 +30,13 @@ Ed Pentz from Crossref makes the case for HTTPS in a [September blog post](http:
|
|
30
30
|
|
31
31
|
What many users don’t know is that doi.org is not the only proxy server for DOIs. DOIs use the handle system and any handle server will resolve a DOI, just as doi.org will resolve any handle. This means that [https://hdl.handle.net/10.5281/ZENODO.31780](https://hdl.handle.net/10.5281/ZENODO.31780) will resolve to the landing page for that DOI and that [http://doi.org/10273/BGRB5054RX05201](http://doi.org/10273/BGRB5054RX05201) is a handle (for a [IGSN](http://www.igsn.org/)) and not a DOI.
|
32
32
|
|
33
|
-
|
33
|
+
## Prefix
|
34
34
|
|
35
35
|
The DOI prefix is used as a namespace so that DOIs are globally unique without requiring global coordination for every new identifier. Prefixes in the handle system and therefore for DOIs are numbers without any semantic meaning. One lesson learned with persistent identifiers is that adding meaning to the identifier (e.g. by using a prefix with the name of the data repository) is always dangerous, because – despite best intentions – all names can change over time.
|
36
36
|
|
37
37
|
Since the DOI prefix is a namespace to keep DOIs globally unique, there is usually no need for multiple prefixes for one organization managing DOI assignment. The tricky part is that these responsibilities can change, e.g. when an organization manages multiple repositories and one of them is migrated to another organization. It therefore makes sense to assign one prefix per list of resources that always stays together, e.g. one repository. It is possible that one prefix is managed by multiple organizations (as long as they use the same DOI registration agency), but that makes DOI management more complex.
|
38
38
|
|
39
|
-
|
39
|
+
## Suffix
|
40
40
|
|
41
41
|
The suffix for a DOI can be (almost) any string. Which is both a feature and a curse. It is a feature because it gives maximal flexibility, for example when migrating existing identifiers to the DOI system. And it is a curse because it not always works well in the web context, as the list of characters allowed in a URL is limited. A good example of this are SICIs ([Serial Item and Contribution Identifier](https://en.wikipedia.org/wiki/Serial_Item_and_Contribution_Identifier)), they were defined in 1996 before the DOI system was implemented, and could then be migrated to DOIs. Unfortunately they can contain many characters that are problematic in a URL or make it difficult to validate the DOI, as in [https://doi.org/10.1002/(sici)1099-1409(199908/10)3:6/7<672::aid-jpp192>3.0.co;2-8](https://doi.org/10.1002/(sici)1099-1409(199908/10)3:6/7<672::aid-jpp192>3.0.co;2-8). A Crossref [blog post](http://blog.crossref.org/2015/08/doi-regular-expressions.html) by Andrew Gilmartin gives a good overview about the characters found in DOIs and suggests the following regular expression to check for valid DOIs:
|
42
42
|
|
@@ -53,7 +53,7 @@ Semantic information might also lead users to expect certain functionalities. A
|
|
53
53
|
|
54
54
|
Another issue to keep in mind when assigning suffixes is that DOIs – in contrast to HTTP URIs – are case-insensitive, [https://doi.org/10.5281/ZENODO.31780](https://doi.org/10.5281/ZENODO.31780) and [https://doi.org/10.5281/zenodo.31780](https://doi.org/10.5281/zenodo.31780) are the same DOI. All DOIs are [converted to upper case](https://www.doi.org/doi_handbook/2_Numbering.html#2.4) upon registration and DOI resolution, but DOIs are not consistently displayed in such a way.
|
55
55
|
|
56
|
-
|
56
|
+
## Generating cool DOIs
|
57
57
|
|
58
58
|
With all that, what should the ideal DOI look like? Its suffix should be:
|
59
59
|
|
@@ -93,6 +93,6 @@ This can be used to quickly verify a DOI, e.g. in a web form or API. The Ruby ba
|
|
93
93
|
|
94
94
|
To answer the question raised at the beginning: a cool DOI is a DOI expressed as HTTPS URI using the doi.org proxy and using a base32-encoded suffix, for example **https://doi.org/10.5555/KVTD-VPWM**. This DOI works well in a web environment, is human readable, easy to parse and detect (e.g. in text mining), and can be generated using an algorithm that is well understood and supported.
|
95
95
|
|
96
|
-
![](
|
96
|
+
![](images/2016/12/cool-dois.png)
|
97
97
|
|
98
|
-
|
98
|
+
## References
|
@@ -0,0 +1,98 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN"
|
3
|
+
"http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
|
4
|
+
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="editorial" dtd-version="1.1">
|
5
|
+
<front>
|
6
|
+
<article-meta>
|
7
|
+
<article-id pub-id-type="doi">10.23725/0000-03VC</article-id>
|
8
|
+
<article-categories>
|
9
|
+
<subj-group subj-group-type="categories">
|
10
|
+
<subject>doi</subject>
|
11
|
+
<subject>featured</subject>
|
12
|
+
</subj-group>
|
13
|
+
</article-categories>
|
14
|
+
<title-group>
|
15
|
+
<article-title>Cool DOI's</article-title>
|
16
|
+
</title-group>
|
17
|
+
<contrib-group>
|
18
|
+
<contrib contrib-type="author">
|
19
|
+
<name>
|
20
|
+
<string-name>mfenner</string-name>
|
21
|
+
</name>
|
22
|
+
</contrib>
|
23
|
+
</contrib-group>
|
24
|
+
<pub-date pub-type="epub" iso-8601-date="2016-12-15">
|
25
|
+
<string-date>2016-12-15</string-date>
|
26
|
+
</pub-date>
|
27
|
+
</article-meta>
|
28
|
+
</front>
|
29
|
+
<body>
|
30
|
+
<sec id="sec-1">
|
31
|
+
<title/>
|
32
|
+
<p>In 1998 Tim Berners-Lee coined the term cool URIs <xref ref-type="bibr" rid="r001">[1]</xref>, that is URIs that don’t change. We know that URLs referenced in the scholarly literature are often not cool, leading to link rot <xref ref-type="bibr" rid="r002">[2]</xref> and making it hard or impossible to find the referenced resource.READMORE</p>
|
33
|
+
<p>Cool URIs are, of course, a fundamental principle behind DOIs, with the two important concepts <ext-link ext-link-type="uri" xlink:href="https://www.doi.org/doi_handbook/3_Resolution.html" xlink:type="simple"><italic>resolution</italic></ext-link> (it is very hard to maintain a URL directly pointing at a resource) and <ext-link ext-link-type="uri" xlink:href="https://www.doi.org/doi_handbook/6_Policies.html" xlink:type="simple"><italic>policies</italic></ext-link> (that all DOI registration agencies and organizations minting DOIs agree to maintain the redirection). The third essential element for DOIs, their <ext-link ext-link-type="uri" xlink:href="https://www.doi.org/doi_handbook/4_Data_Model.html" xlink:type="simple"><italic>data model</italic></ext-link>, is not directly about persistent linking, but about the discoverability of the linked resources via standard metadata in a central index.</p>
|
34
|
+
<p>All DOIs, expressed as HTTP URI, are therefore cool URIs. So what is a cool DOI? And, furthermore, how to create and use them? To understand what a cool DOI is, we have to explain the three parts that make up a DOI:</p>
|
35
|
+
<fig id="g001"><caption>images/2016/12/doi-parts.png</caption><graphic mimetype="image" xlink:href="fig:" xlink:type="simple"/></fig>
|
36
|
+
</sec>
|
37
|
+
<sec id="sec-1.1">
|
38
|
+
<title>Proxy</title>
|
39
|
+
<p>The proxy is not part of the DOI specification, but almost all scholarly DOIs that users encounter today will be expressed as HTTP URLs. DataCite recommends that all DOIs are displayed as permanent URLs, consistent with the recommendations of other DOI registration agencies, e.g. the <ext-link ext-link-type="uri" xlink:href="http://www.crossref.org/02publishers/doi_display_guidelines.html" xlink:type="simple">Crossref DOI display guidelines</ext-link>. When the DOI system was originally designed, it was thought that the DOI protocol would become widely used, but that clearly has not happened and displaying DOIs as <bold>doi:10.5281/ZENODO.31780</bold> is therefore not recommended.</p>
|
40
|
+
<p>The DOI proxy enables the functionality of expressing DOIs as HTTP URIs. Users should also be aware of these two recommendations:</p>
|
41
|
+
<list list-type="bullet">
|
42
|
+
<list-item>Use <ext-link ext-link-type="uri" xlink:href="https://www.doi.org/doi_proxy/proxy_policies.html" xlink:type="simple">doi.org</ext-link> instead of dx.doi.org as DNS name</list-item>
|
43
|
+
<list-item>Use the HTTPS protocol instead of HTTP protocol</list-item>
|
44
|
+
</list>
|
45
|
+
<p>Ed Pentz from Crossref makes the case for HTTPS in a <ext-link ext-link-type="uri" xlink:href="http://blog.crossref.org/2016/09/new-crossref-doi-display-guidelines.html" xlink:type="simple">September blog post</ext-link>. The web, and therefore also the scholarly web, is moving to HTTPS as the default. It is important that the DOI proxy redirects to HTTPS URLs, and it will take some time until all DataCite data centers use HTTPS for the landing pages their DOIs redirects to.</p>
|
46
|
+
<p>What many users don’t know is that doi.org is not the only proxy server for DOIs. DOIs use the handle system and any handle server will resolve a DOI, just as doi.org will resolve any handle. This means that <ext-link ext-link-type="uri" xlink:href="https://hdl.handle.net/10.5281/ZENODO.31780" xlink:type="simple">https://hdl.handle.net/10.5281/ZENODO.31780</ext-link> will resolve to the landing page for that DOI and that <ext-link ext-link-type="uri" xlink:href="http://doi.org/10273/BGRB5054RX05201" xlink:type="simple">http://doi.org/10273/BGRB5054RX05201</ext-link> is a handle (for a <ext-link ext-link-type="uri" xlink:href="http://www.igsn.org/" xlink:type="simple">IGSN</ext-link>) and not a DOI.</p>
|
47
|
+
</sec>
|
48
|
+
<sec id="sec-1.2">
|
49
|
+
<title>Prefix</title>
|
50
|
+
<p>The DOI prefix is used as a namespace so that DOIs are globally unique without requiring global coordination for every new identifier. Prefixes in the handle system and therefore for DOIs are numbers without any semantic meaning. One lesson learned with persistent identifiers is that adding meaning to the identifier (e.g. by using a prefix with the name of the data repository) is always dangerous, because – despite best intentions – all names can change over time.</p>
|
51
|
+
<p>Since the DOI prefix is a namespace to keep DOIs globally unique, there is usually no need for multiple prefixes for one organization managing DOI assignment. The tricky part is that these responsibilities can change, e.g. when an organization manages multiple repositories and one of them is migrated to another organization. It therefore makes sense to assign one prefix per list of resources that always stays together, e.g. one repository. It is possible that one prefix is managed by multiple organizations (as long as they use the same DOI registration agency), but that makes DOI management more complex.</p>
|
52
|
+
</sec>
|
53
|
+
<sec id="sec-1.3">
|
54
|
+
<title>Suffix</title>
|
55
|
+
<p>The suffix for a DOI can be (almost) any string. Which is both a feature and a curse. It is a feature because it gives maximal flexibility, for example when migrating existing identifiers to the DOI system. And it is a curse because it not always works well in the web context, as the list of characters allowed in a URL is limited. A good example of this are SICIs (<ext-link ext-link-type="uri" xlink:href="https://en.wikipedia.org/wiki/Serial_Item_and_Contribution_Identifier" xlink:type="simple">Serial Item and Contribution Identifier</ext-link>), they were defined in 1996 before the DOI system was implemented, and could then be migrated to DOIs. Unfortunately they can contain many characters that are problematic in a URL or make it difficult to validate the DOI, as in <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/(sici)1099-1409(199908/10)3:6/7%3C672::aid-jpp192%3E3.0.co;2-8" xlink:type="simple">https://doi.org/10.1002/(sici)1099-1409(199908/10)3:6/7<672::aid-jpp192>3.0.co;2-8</ext-link>. A Crossref <ext-link ext-link-type="uri" xlink:href="http://blog.crossref.org/2015/08/doi-regular-expressions.html" xlink:type="simple">blog post</ext-link> by Andrew Gilmartin gives a good overview about the characters found in DOIs and suggests the following regular expression to check for valid DOIs:</p>
|
56
|
+
<pre><code>/^10.\d{4,9}/[-._;()/:A-Z0-9]+$/i</code></pre>
|
57
|
+
<p>SICIs demonstrate two other pitfalls:</p>
|
58
|
+
<list list-type="bullet">
|
59
|
+
<list-item>they contain semantic information (ISSN, volume, number, etc.) that may change over time, and</list-item>
|
60
|
+
<list-item>they are long, difficult to transcribe, with characters not allowed in URLs, and not very human-readable.</list-item>
|
61
|
+
</list>
|
62
|
+
<p>Semantic information might also lead users to expect certain functionalities. A common pattern that we see at DataCite is to include information about the version or parent in the suffix, e.g. <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.6084/M9.FIGSHARE.3501629.V1" xlink:type="simple">https://doi.org/10.6084/M9.FIGSHARE.3501629.V1</ext-link> or <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5061/DRYAD.0SN63/7" xlink:type="simple">https://doi.org/10.5061/DRYAD.0SN63/7</ext-link>. While the decision on what to put into the suffix is up to each data center, we should make sure users don't think that these are functionalities of the DOI system (e.g. that adding <bold>.V2</bold> to any DOI name will resolve to version 2 of that resource).</p>
|
63
|
+
<p>Another issue to keep in mind when assigning suffixes is that DOIs – in contrast to HTTP URIs – are case-insensitive, <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/ZENODO.31780" xlink:type="simple">https://doi.org/10.5281/ZENODO.31780</ext-link> and <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.31780" xlink:type="simple">https://doi.org/10.5281/zenodo.31780</ext-link> are the same DOI. All DOIs are <ext-link ext-link-type="uri" xlink:href="https://www.doi.org/doi_handbook/2_Numbering.html#2.4" xlink:type="simple">converted to upper case</ext-link> upon registration and DOI resolution, but DOIs are not consistently displayed in such a way.</p>
|
64
|
+
</sec>
|
65
|
+
<sec id="sec-1.4">
|
66
|
+
<title>Generating cool DOIs</title>
|
67
|
+
<p>With all that, what should the ideal DOI look like? Its suffix should be:</p>
|
68
|
+
<list list-type="bullet">
|
69
|
+
<list-item>opaque without semantic information</list-item>
|
70
|
+
<list-item>work well in a web environment, avoiding characters problematic in URLs</list-item>
|
71
|
+
<list-item>short and human-readable</list-item>
|
72
|
+
<list-item>Resistant to transcription errors</list-item>
|
73
|
+
<list-item>easy to generate</list-item>
|
74
|
+
</list>
|
75
|
+
<p>On Tuesday DataCite released a tool that helps generating such a suffix, an open source command line tool called <ext-link ext-link-type="uri" xlink:href="https://github.com/datacite/cirneco" xlink:type="simple">cirneco</ext-link> (a lot of our open source software uses Italian dog breed names). Cirneco is a Ruby gem that can be installed via</p>
|
76
|
+
<pre><code>gem install cirneco</code></pre>
|
77
|
+
<p>Cirneco uses base32 encoding, as <ext-link ext-link-type="uri" xlink:href="http://www.crockford.com/wrmg/base32.html" xlink:type="simple">described</ext-link> by Douglas Crockford. The encoding starts with a randomly generated number to guarantee uniqueness of the identifier, and then encodes the number into a string that uses all numbers and uppercase letters. It avoids the letters I, O and L as they can be confused with the letter 1 and 0, using 32 characters (and 5 checksum characters) in total. The last character is a checksum. The resulting string from cirneco always has a length of 8 characters, in groups of 4 separated by a hyphen to help with readability. The advantage of base32 encoding over using only numbers (as for example ORCID is doing) is that the resulting string becomes much more compact, the available 7 characters (plus one for the checksum) can encode 34,359,738,367 strings, compared to 10 million when only using numbers. This number is large enough that the resulting suffix will not only be unique for a given prefix, but also unique for all DOIs (there is a very small chance to get the same random number twice, but this will be rejected when trying to register the DOI).</p>
|
78
|
+
<p>Another common way to generate random strings would have been universally unique identifiers (<ext-link ext-link-type="uri" xlink:href="https://en.wikipedia.org/wiki/Universally_unique_identifier" xlink:type="simple">UUID</ext-link>), but they are long and not very human-readable, e.g. <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.4233/UUID:6D192FE2-DE18-4556-873A-D3CD56AB96A6" xlink:type="simple">https://doi.org/10.4233/UUID:6D192FE2-DE18-4556-873A-D3CD56AB96A6</ext-link>.</p>
|
79
|
+
<p>An example DOI generated by cirneco would be</p>
|
80
|
+
<pre><code>cirneco doi generate --prefix 10.5555
|
81
|
+
10.5555/KVTD-VPWM</code></pre>
|
82
|
+
<p>The generated DOI is short enough that it should work well in places where space is limited, providing an alternative to the <ext-link ext-link-type="uri" xlink:href="http://shortdoi.org/" xlink:type="simple">ShortDOI</ext-link> service which shortens existing DOIs, but does this by adding another layer on top of the DOI proxy.</p>
|
83
|
+
<p>Another cirneco command checks that this is a valid base32 string using the checksum</p>
|
84
|
+
<pre><code>cirneco doi check 10.5555/KVTD-VPWM
|
85
|
+
Checksum for 10.5555/KVTD-VPWM is valid</code></pre>
|
86
|
+
<p>This can be used to quickly verify a DOI, e.g. in a web form or API. The Ruby base32 encoding library used by cirneco is open source (<ext-link ext-link-type="uri" xlink:href="https://github.com/datacite/base32" xlink:type="simple">https://github.com/datacite/base32</ext-link>. I added the checksum to the existing library), and implementations of the Crockford base32 encoding pattern are available in many other languages, including <ext-link ext-link-type="uri" xlink:href="https://github.com/jbittel/base32-crockford" xlink:type="simple">Python</ext-link>, <ext-link ext-link-type="uri" xlink:href="https://github.com/dflydev/dflydev-base32-crockford" xlink:type="simple">PHP</ext-link>, <ext-link ext-link-type="uri" xlink:href="https://www.npmjs.com/package/base32-crockford" xlink:type="simple">Javascript</ext-link>, <ext-link ext-link-type="uri" xlink:href="http://stackoverflow.com/questions/22385467/crockford-base32-encoding-for-large-number-java-implementation" xlink:type="simple">Java</ext-link>, <ext-link ext-link-type="uri" xlink:href="https://github.com/richardlehane/crock32" xlink:type="simple">Go</ext-link> and <ext-link ext-link-type="uri" xlink:href="https://crockfordbase32.codeplex.com/" xlink:type="simple">.NET</ext-link>.</p>
|
87
|
+
<p>To answer the question raised at the beginning: a cool DOI is a DOI expressed as HTTPS URI using the doi.org proxy and using a base32-encoded suffix, for example <bold>https://doi.org/10.5555/KVTD-VPWM</bold>. This DOI works well in a web environment, is human readable, easy to parse and detect (e.g. in text mining), and can be generated using an algorithm that is well understood and supported.</p>
|
88
|
+
<fig id="g002"><caption>images/2016/12/cool-dois.png</caption><graphic mimetype="image" xlink:href="fig:" xlink:type="simple"/></fig>
|
89
|
+
</sec>
|
90
|
+
</body>
|
91
|
+
<back>
|
92
|
+
<ref-list><title>References</title><ref id="r001"><label>1</label>
|
93
|
+
<element-citation publication-type="standard"><person-group person-group-type="author"><name><surname>Berners-Lee</surname><given-names>Tim</given-names></name></person-group><article-title>Hypertext Style: Cool URIs don’t change.</article-title><date-in-citation content-type="access-date" iso-8601-date="2016-12-14"><day>14</day><month>12</month><year>2016</year></date-in-citation><ext-link ext-link-type="uri" xlink:href="https://www.w3.org/Provider/Style/URI" xlink:type="simple">https://www.w3.org/Provider/Style/URI</ext-link></element-citation>
|
94
|
+
</ref>
|
95
|
+
<ref id="r002"><label>2</label>
|
96
|
+
<element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Klein</surname><given-names>Martin</given-names></name><name><surname>Sompel</surname><given-names>Herbert Van de</given-names></name><name><surname>Sanderson</surname><given-names>Robert</given-names></name><name><surname>Shankar</surname><given-names>Harihar</given-names></name><name><surname>Balakireva</surname><given-names>Lyudmila</given-names></name><name><surname>Zhou</surname><given-names>Ke</given-names></name><name><surname>Tobin</surname><given-names>Richard</given-names></name></person-group><article-title>Scholarly Context Not Found: One in Five Articles Suffers from Reference Rot</article-title><source>PLOS ONE</source><date iso-8601-date="2014-12"><month>12</month><year>2014</year></date><volume>9</volume><issue>12</issue><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pone.0115253" xlink:type="simple">10.1371/journal.pone.0115253</ext-link><ext-link ext-link-type="uri" xlink:href="http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0115253" xlink:type="simple">http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0115253</ext-link></element-citation>
|
97
|
+
</ref></ref-list></back>
|
98
|
+
</article>
|
data/spec/jats_spec.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Bergamasco::Jats do
  subject { Bergamasco::Jats }

  # Fixture document and converter options shared by both examples.
  let(:filepath) { fixture_path + 'cool-dois.html.md' }
  let(:options) do
    { skip_yaml_header: true,
      csl: 'spec/fixtures/apa.csl',
      bibliography: 'spec/fixtures/references.yaml' }
  end

  # render_jats returns the XML as a string; check the DOI made it into
  # <article-id> and that the element's first attribute value is "doi".
  it 'should convert to jats' do
    source = IO.read(filepath)
    xml = subject.render_jats(source, options)
    article_id = Nokogiri::XML(xml).at_xpath("//article-id")
    expect(article_id.text).to eq("10.23725/0000-03VC")
    expect(article_id.values.first).to eq("doi")
  end

  # write_jats returns the path of the file it wrote; parse that file and
  # apply the same checks.
  it 'should write jats xml' do
    xml_path = subject.write_jats(filepath, options)
    doc = File.open(xml_path) { |io| Nokogiri::XML(io) }
    article_id = doc.at_xpath("//article-id")
    expect(article_id.text).to eq("10.23725/0000-03VC")
    expect(article_id.values.first).to eq("doi")
  end
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN"
|
3
|
+
"http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
|
4
|
+
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="editorial" dtd-version="1.1">
|
5
|
+
<front>
|
6
|
+
$if(publisher)$
|
7
|
+
<journal-meta>
|
8
|
+
<publisher>
|
9
|
+
<publisher-name>$publisher$</publisher-name>
|
10
|
+
</publisher>
|
11
|
+
</journal-meta>
|
12
|
+
$endif$
|
13
|
+
<article-meta>
|
14
|
+
$if(doi)$
|
15
|
+
<article-id pub-id-type="doi">$doi$</article-id>
|
16
|
+
$endif$
|
17
|
+
$if(tags)$
|
18
|
+
<article-categories>
|
19
|
+
<subj-group subj-group-type="categories">
|
20
|
+
$for(tags)$
|
21
|
+
<subject>$tags$</subject>
|
22
|
+
$endfor$
|
23
|
+
</subj-group>
|
24
|
+
</article-categories>
|
25
|
+
$endif$
|
26
|
+
$if(title)$
|
27
|
+
<title-group>
|
28
|
+
<article-title>$title$</article-title>
|
29
|
+
</title-group>
|
30
|
+
$endif$
|
31
|
+
$if(author)$
|
32
|
+
<contrib-group>
|
33
|
+
$for(author)$
|
34
|
+
<contrib contrib-type="author">
|
35
|
+
$if(author.orcid)$
|
36
|
+
<contrib-id contrib-id-type="orcid">$author.orcid$</contrib-id>
|
37
|
+
$endif$
|
38
|
+
$if(author.family_name)$
|
39
|
+
<name>
|
40
|
+
<surname>$author.family_name$</surname>
|
41
|
+
<given-names>$author.given_name$</given-names>
|
42
|
+
</name>
|
43
|
+
$else$
|
44
|
+
<string-name>$author$</string-name>
|
45
|
+
$endif$
|
46
|
+
</contrib>
|
47
|
+
$endfor$
|
48
|
+
</contrib-group>
|
49
|
+
$endif$
|
50
|
+
$if(date)$
|
51
|
+
<pub-date pub-type="epub" iso-8601-date="$date$">
|
52
|
+
$if(publication_day)$
|
53
|
+
<day>$publication_day$</day>
|
54
|
+
$endif$
|
55
|
+
$if(publication_month)$
|
56
|
+
<month>$publication_month$</month>
|
57
|
+
$endif$
|
58
|
+
$if(publication_year)$
|
59
|
+
<year>$publication_year$</year>
|
60
|
+
$else$
|
61
|
+
<string-date>$date$</string-date>
|
62
|
+
$endif$
|
63
|
+
</pub-date>
|
64
|
+
$endif$
|
65
|
+
$if(license_name)$
|
66
|
+
<permissions>
|
67
|
+
<license license-type="open-access" xlink:href="$license_url$">
|
68
|
+
<license-p>$license_name$</license-p>
|
69
|
+
</license>
|
70
|
+
</permissions>
|
71
|
+
$endif$
|
72
|
+
$if(subjects)$
|
73
|
+
<kwd-group kwd-group-type="author">
|
74
|
+
$for(subjects)$
|
75
|
+
<kwd>$subjects$</kwd>
|
76
|
+
$endfor$
|
77
|
+
</kwd-group>
|
78
|
+
$endif$
|
79
|
+
</article-meta>
|
80
|
+
</front>
|
81
|
+
$body$
|
82
|
+
</article>
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bergamasco
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: '0.3'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Martin Fenner
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-12-
|
11
|
+
date: 2016-12-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -267,6 +267,9 @@ files:
|
|
267
267
|
- Rakefile
|
268
268
|
- bergamasco.gemspec
|
269
269
|
- lib/bergamasco.rb
|
270
|
+
- lib/bergamasco/jats.csl
|
271
|
+
- lib/bergamasco/jats.lua
|
272
|
+
- lib/bergamasco/jats.rb
|
270
273
|
- lib/bergamasco/markdown.rb
|
271
274
|
- lib/bergamasco/sanitize.rb
|
272
275
|
- lib/bergamasco/summarize.rb
|
@@ -276,13 +279,16 @@ files:
|
|
276
279
|
- spec/fixtures/apa.csl
|
277
280
|
- spec/fixtures/cool-dois-without-yml.md
|
278
281
|
- spec/fixtures/cool-dois.html.md
|
282
|
+
- spec/fixtures/cool-dois.xml
|
279
283
|
- spec/fixtures/cool-dois.yml
|
280
284
|
- spec/fixtures/references.bib
|
281
285
|
- spec/fixtures/references.yaml
|
286
|
+
- spec/jats_spec.rb
|
282
287
|
- spec/markdown_spec.rb
|
283
288
|
- spec/sanitize_spec.rb
|
284
289
|
- spec/spec_helper.rb
|
285
290
|
- spec/summarize_spec.rb
|
291
|
+
- templates/default.jats
|
286
292
|
homepage: https://github.com/datacite/bergamasco
|
287
293
|
licenses:
|
288
294
|
- MIT
|