bergamasco 0.2.14 → 0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 84f5ca3d564e4c58844e5c93cb279e8c876a727b
4
- data.tar.gz: 9f8edb6e83bb4b53d17cb5ea914d8c27ea6bf1b5
3
+ metadata.gz: adf586839173e90769dcbdeff4e32c4c0b4e4d5b
4
+ data.tar.gz: 73ea901886feefc53424da597aab4d3a9c32578b
5
5
  SHA512:
6
- metadata.gz: 1a20df578aecf8801d0cf7324c145140612e48197ec767849932823fdb6927b2c2e9f291a784b338d2cc00b7486559763e012c7bd9580f91b798aa7a1a2fcd36
7
- data.tar.gz: 8e55096957362a20b91f1832d734b0bbca6967dbed5d66419a8c9aa36811d7ba111eea13fca42a16da569ddef8841244a9a86563b3e7e273088cc641e41df774
6
+ metadata.gz: 499eeac743a1c687aacafa8d481cb32963371a2684d655aaf58a312f7b8fb97f3b17f39e5d1c5bbbc30af284a728923d2d8d494c600a892270121920654dba35
7
+ data.tar.gz: d84baf3e47763d3a38d60cf37004c1084b0a311b1055447da8b4993f373e8765cea4f76f50614789b5e5d6f1e83b11b56820c60e50850a84b79444b0c78e7053
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- bergamasco (0.2.14)
4
+ bergamasco (0.3)
5
5
  activesupport (~> 4.2, >= 4.2.5)
6
6
  addressable (~> 2.3.8)
7
7
  builder (~> 3.2, >= 3.2.2)
data/lib/bergamasco.rb CHANGED
@@ -13,4 +13,5 @@ require 'addressable/uri'
13
13
  require "bergamasco/summarize"
14
14
  require "bergamasco/sanitize"
15
15
  require "bergamasco/markdown"
16
+ require "bergamasco/jats"
16
17
  require "bergamasco/whitelist_scrubber"
@@ -0,0 +1,175 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <style xmlns="http://purl.org/net/xbiblio/csl" class="in-text" version="1.0" default-locale="en-US">
3
+ <!-- This style was edited with the Visual CSL Editor (http://editor.citationstyles.org/visualEditor/) -->
4
+ <info>
5
+ <title>Journal Article Tag Suite</title>
6
+ <title-short>JATS</title-short>
7
+ <id>http://www.zotero.org/styles/journal-article-tag-suite</id>
8
+ <link href="http://www.zotero.org/styles/journal-article-tag-suite" rel="self"/>
9
+ <link rel="documentation" href="http://jats.nlm.nih.gov/archiving/tag-library/1.0/index.html"/>
10
+ <author>
11
+ <name>Martin Fenner</name>
12
+ <email>mfenner@plos.org</email>
13
+ </author>
14
+ <category citation-format="numeric"/>
15
+ <category field="medicine"/>
16
+ <category field="biology"/>
17
+ <summary>Use this style to generate bibliographic data in Journal Article Tag Suite (JATS) 1.0 XML format</summary>
18
+ <updated>2015-04-26T17:02:43+00:00</updated>
19
+ <rights license="http://creativecommons.org/licenses/by-sa/3.0/">This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License</rights>
20
+ </info>
21
+ <locale xml:lang="en">
22
+ <terms>
23
+ <term name="et-al">&lt;etal/&gt;</term>
24
+ </terms>
25
+ </locale>
26
+ <macro name="citation-label">
27
+ <text variable="citation-number" prefix="id=&quot;" suffix="&quot;&gt;"/>
28
+ <text variable="citation-number" prefix="&lt;label&gt;" suffix="&lt;/label&gt;"/>
29
+ </macro>
30
+ <macro name="author">
31
+ <names variable="author">
32
+ <name delimiter="&lt;/name&gt;&lt;name&gt;" prefix="&lt;name&gt;" suffix="&lt;/name&gt;" name-as-sort-order="all" sort-separator="">
33
+ <name-part name="family" text-case="capitalize-first" prefix="&lt;surname&gt;" suffix="&lt;/surname&gt;"/>
34
+ <name-part name="given" text-case="capitalize-first" prefix="&lt;given-names&gt;" suffix="&lt;/given-names&gt;"/>
35
+ </name>
36
+ <substitute>
37
+ <names variable="editor"/>
38
+ </substitute>
39
+ </names>
40
+ </macro>
41
+ <macro name="editor">
42
+ <group delimiter=": ">
43
+ <names variable="editor">
44
+ <name delimiter="" prefix="&lt;name&gt;" suffix="&lt;/name&gt;" name-as-sort-order="all" sort-separator="">
45
+ <name-part name="family" text-case="capitalize-first" prefix="&lt;surname&gt;" suffix="&lt;/surname&gt;"/>
46
+ <name-part name="given" text-case="capitalize-first" prefix="&lt;given-names&gt;" suffix="&lt;/given-names&gt;"/>
47
+ </name>
48
+ </names>
49
+ </group>
50
+ </macro>
51
+ <macro name="title">
52
+ <text variable="title"/>
53
+ </macro>
54
+ <macro name="container-title">
55
+ <text variable="container-title" form="short" prefix="&lt;source&gt;" suffix="&lt;/source&gt;"/>
56
+ </macro>
57
+ <macro name="publisher">
58
+ <text variable="publisher" prefix="&lt;publisher-name&gt;" suffix="&lt;/publisher-name&gt;"/>
59
+ <text variable="publisher-place" prefix="&lt;publisher-loc&gt;" suffix="&lt;/publisher-loc&gt;"/>
60
+ </macro>
61
+ <macro name="link">
62
+ <choose>
63
+ <if match="any" variable="DOI">
64
+ <text variable="DOI" />
65
+ </if>
66
+ </choose>
67
+ <choose>
68
+ <if match="any" variable="PMID">
69
+ <text variable="PMID" />
70
+ </if>
71
+ </choose>
72
+ <choose>
73
+ <if variable="URL" match="any">
74
+ <text variable="URL" />
75
+ </if>
76
+ </choose>
77
+ </macro>
78
+ <macro name="date">
79
+ <choose>
80
+ <if type="article-journal article-magazine article-newspaper report patent" match="any">
81
+ <group prefix="&lt;date" suffix="&lt;/date&gt;">
82
+ <date variable="issued" prefix=" iso-8601-date=&quot;" suffix="&quot;&gt;">
83
+ <date-part name="year" range-delimiter=""/>
84
+ <date-part name="month" form="numeric-leading-zeros" range-delimiter="" prefix="-"/>
85
+ <date-part name="day" form="numeric-leading-zeros" range-delimiter="" prefix="-"/>
86
+ </date>
87
+ <date variable="issued">
88
+ <date-part name="day" form="numeric-leading-zeros" prefix="&lt;day&gt;" suffix="&lt;/day&gt;"/>
89
+ <date-part name="month" form="numeric-leading-zeros" prefix="&lt;month&gt;" suffix="&lt;/month&gt;"/>
90
+ <date-part name="year" prefix="&lt;year&gt;" suffix="&lt;/year&gt;"/>
91
+ </date>
92
+ </group>
93
+ </if>
94
+ <else>
95
+ <group prefix="&lt;date-in-citation content-type=&quot;access-date&quot;" suffix="&lt;/date-in-citation&gt;">
96
+ <date variable="accessed" prefix=" iso-8601-date=&quot;" suffix="&quot;&gt;">
97
+ <date-part name="year"/>
98
+ <date-part name="month" form="numeric-leading-zeros" prefix="-"/>
99
+ <date-part name="day" form="numeric-leading-zeros" prefix="-"/>
100
+ </date>
101
+ <date variable="accessed">
102
+ <date-part name="day" prefix="&lt;day&gt;" suffix="&lt;/day&gt;"/>
103
+ <date-part name="month" form="numeric-leading-zeros" prefix="&lt;month&gt;" suffix="&lt;/month&gt;"/>
104
+ <date-part name="year" prefix="&lt;year&gt;" suffix="&lt;/year&gt;"/>
105
+ </date>
106
+ </group>
107
+ </else>
108
+ </choose>
109
+ </macro>
110
+ <macro name="location">
111
+ <choose>
112
+ <if type="article-journal article-magazine" match="any">
113
+ <text variable="volume" prefix="&lt;volume&gt;" suffix="&lt;/volume&gt;"/>
114
+ <text variable="issue" prefix="&lt;issue&gt;" suffix="&lt;/issue&gt;"/>
115
+ </if>
116
+ </choose>
117
+ <choose>
118
+ <if type="article-journal article-magazine article-newspaper" match="any">
119
+ <text variable="page-first" prefix="&lt;fpage&gt;" suffix="&lt;/fpage&gt;"/>
120
+ </if>
121
+ </choose>
122
+ </macro>
123
+ <macro name="publication-type">
124
+ <group prefix=" publication-type=&quot;" suffix="&quot;&gt;">
125
+ <choose>
126
+ <if type="article-journal article-magazine article-newspaper" match="any">
127
+ <text value="journal"/>
128
+ </if>
129
+ <else-if type="book" match="any">
130
+ <text value="book"/>
131
+ </else-if>
132
+ <else-if type="dataset" match="any">
133
+ <text value="dataset"/>
134
+ </else-if>
135
+ <else-if type="patent" match="any">
136
+ <text value="patent"/>
137
+ </else-if>
138
+ <else-if type="report" match="any">
139
+ <text value="report"/>
140
+ </else-if>
141
+ <else-if type="review" match="any">
142
+ <text value="review"/>
143
+ </else-if>
144
+ <else>
145
+ <text value="standard"/>
146
+ </else>
147
+ </choose>
148
+ </group>
149
+ </macro>
150
+ <citation collapse="citation-number">
151
+ <sort>
152
+ <key variable="citation-number"/>
153
+ </sort>
154
+ <layout delimiter=",">
155
+ <text variable="citation-number"/>
156
+ </layout>
157
+ </citation>
158
+ <bibliography sort-separator="">
159
+ <layout>
160
+ <group prefix="&lt;ref " suffix="&lt;/ref&gt;">
161
+ <text macro="citation-label" suffix="&#10;"/>
162
+ <group prefix="&lt;element-citation" suffix="&lt;/element-citation&gt;&#10;">
163
+ <text macro="publication-type"/>
164
+ <text macro="author" prefix="&lt;person-group person-group-type=&quot;author&quot;&gt;" suffix="&lt;/person-group&gt;"/>
165
+ <text macro="title" prefix="&lt;article-title&gt;" suffix="&lt;/article-title&gt;"/>
166
+ <text macro="container-title"/>
167
+ <text macro="publisher"/>
168
+ <text macro="date"/>
169
+ <text macro="location"/>
170
+ <text macro="link"/>
171
+ </group>
172
+ </group>
173
+ </layout>
174
+ </bibliography>
175
+ </style>
@@ -0,0 +1,662 @@
1
+ -- This is a JATS custom writer for pandoc. It produces output
2
+ -- that tries to conform to the JATS 1.0 specification
3
+ -- http://jats.nlm.nih.gov/archiving/tag-library/1.0/index.html
4
+ --
5
+ -- Invoke with: pandoc -t jats.lua
6
+ --
7
+ -- Note: you need not have lua installed on your system to use this
8
+ -- custom writer. However, if you do have lua installed, you can
9
+ -- use it to test changes to the script. 'lua jats.lua' will
10
+ -- produce informative error messages if your code contains
11
+ -- syntax errors.
12
+ --
13
+ -- Released under the GPL, version 2 or greater. See LICENSE for more info.
14
+
15
+ -- Tables to store metadata, headers, sections, back sections, references, figures and footnotes
16
+ local meta = {}
17
+ local headers = {}
18
+ local sections = {}
19
+ local back = {}
20
+ local references = {}
21
+ local figures = {}
22
+
23
+ -- This function is called once for the whole document. Parameters:
24
+ -- body is a string, metadata is a table, variables is a table.
25
+ -- This gives you a fragment. You could use the metadata table to
26
+ -- fill variables in a custom lua template. Or, pass `--template=...`
27
+ -- to pandoc, and pandoc will do the template processing as
28
+ -- usual.
29
function Doc(body, metadata, variables)
  -- keep the document metadata in the file-level `meta` table
  meta = metadata or {}

  -- Header() emits '</sec>' *before* every '<sec ...>' because it cannot
  -- know where a section ends, so a body that starts with a header starts
  -- with '</sec>'.
  -- if document doesn't start with section, add top-level section without title
  if string.sub(body, 1, 6) ~= '</sec>' then
    body = Header(1, '') .. '\n' .. body
  end

  -- strip closing section tag from beginning, add to end of document
  body = string.sub(body, 7) .. '</sec>'

  -- parse sections; section_helper() files each one into `sections` or `back`.
  -- Its return value is deliberately not stored: assigning it to an
  -- undeclared `attr` created a global that other functions then picked up.
  for lev, title, content in string.gmatch(body, '<sec.-lev="(.-)".->%s<title>(.-)</title>(.-)</sec>') do
    section_helper(tonumber(lev), content, title)
  end

  body = xml('body', '\n' .. table.concat(sections, '\n') .. '\n')

  -- back matter is only emitted when at least one back section was collected
  if #back > 0 then
    body = body .. '\n' .. xml('back', '\n' .. table.concat(back, '\n'))
  end

  return body
end
53
+
54
+ -- XML character entity escaping and unescaping
55
-- Replace the five XML special characters in s with character entities.
-- Returns exactly one value, the escaped string.
function escape(s)
  local map = { ['<'] = '&lt;',
                ['>'] = '&gt;',
                ['&'] = '&amp;',
                ['"'] = '&quot;',
                ['\''] = '&#39;' }
  -- the outer parentheses drop gsub's second result (the substitution
  -- count), which would otherwise leak into the caller's argument list
  return (s:gsub("[<>&\"']", map))
end
63
+
64
-- Inverse of escape(): turn the five entities produced by escape() back
-- into characters. Any other entity is left untouched.
-- Returns exactly one value, the unescaped string.
function unescape(s)
  local map = { ['&lt;'] = '<',
                ['&gt;'] = '>',
                ['&amp;'] = '&',
                ['&quot;'] = '"',
                ['&#39;'] = '\'' }
  -- a table as replacement is indexed by the whole match; a missing key
  -- keeps the match as it is. The parentheses drop gsub's match count.
  return (s:gsub('(&#?[%d%a]+;)', map))
end
72
+
73
+ -- Helper function to convert an attributes table into
74
+ -- a string that can be put into XML elements.
75
+ function attributes(attr)
76
+ local attr_table = {}
77
+ for x, y in pairsByKeys(attr) do
78
+ if y and y ~= '' then
79
+ table.insert(attr_table, string.format(' %s="%s"', x, escape(y)))
80
+ end
81
+ end
82
+ return table.concat(attr_table)
83
+ end
84
+
85
+ -- sort table, so that attributes are in consistent order
86
+ function pairsByKeys (t, f)
87
+ local a = {}
88
+ for n in pairs(t) do table.insert(a, n) end
89
+ table.sort(a, f)
90
+ local i = 0 -- iterator variable
91
+ local iter = function () -- iterator function
92
+ i = i + 1
93
+ if a[i] == nil then return nil
94
+ else return a[i], t[a[i]]
95
+ end
96
+ end
97
+ return iter
98
+ end
99
+
100
+ -- generic xml builder
101
+ function xml(tag, s, attr)
102
+ attr = attr and attributes(attr) or ''
103
+ s = s and '>' .. s .. '</' .. tag .. '>' or '/>'
104
+ return '<' .. tag .. attr .. s
105
+ end
106
+
107
-- Flatten a nested table, needed for nested YAML metadata.
-- Only associative tables are flattened: the keys along the path are
-- joined with a hyphen into one composite key, so { a = { b = 1 } }
-- becomes { ['a-b'] = 1 }. Array-like tables are stored intact under the
-- composite key of their parent; scalar values are copied unchanged.
function flatten_table(tbl)
  local result = {}

  local function flatten(node, prefix)
    for k, v in pairs(node) do
      if type(k) == 'number' and k > 0 and k <= #node then
        -- an array index: treat the whole node as an array and stop
        if prefix then
          result[prefix] = node
        else
          -- a top-level array has no key to be stored under (indexing
          -- result with nil would raise an error), so copy its items
          for i, item in ipairs(node) do
            result[i] = item
          end
        end
        break
      else
        local composite = (prefix and prefix .. '-' or '') .. k
        if type(v) == 'table' then
          flatten(v, composite)
        else
          result[composite] = v
        end
      end
    end
  end

  flatten(tbl)
  return result
end
134
+
135
+ -- Read a file from the working directory and
136
+ -- return its contents (or nil if not found).
137
-- name is the path handed to io.open.
-- Returns the complete file contents as a string, or nil when the file
-- cannot be opened.
function read_file(name)
  local file = io.open(name, "r")
  if not file then return nil end
  local contents = file:read("*all")
  -- close explicitly instead of leaving the handle to the garbage collector
  file:close()
  return contents
end
144
+
145
+ -- Parse YAML string and return table.
146
+ -- We only understand a subset.
147
-- s is the YAML source as one string.
-- Understood subset: `key: value` lines, one level of nesting (children
-- indented two columns deeper than a `key:` line without a value), inline
-- arrays `[a, b]`, and values optionally wrapped in quotes.
-- Returns a table; nested keys become sub-tables, arrays become Lua arrays.
function parse_yaml(s)
  local yaml_lines = {}
  local config = {}
  local parent_indent = 0
  local parent_key = nil

  -- patterns (kept local so that parsing does not create globals)
  local line_pattern = '(.-)\r?\n'
  local config_pattern = '^(%s*)([%w%-]+):%s*(.-)$'

  -- First split string into lines. gsub hands every terminated line to
  -- collect() and removes it; what is left over is the unterminated rest.
  local function collect(line)
    table.insert(yaml_lines, line)
    return ""
  end

  collect((s:gsub(line_pattern, collect)))

  -- Then go over each line and check value and indentation
  for _, line in ipairs(yaml_lines) do
    local indent, tag, value = line:match(config_pattern)
    if tag then
      if value == '' then
        -- a key without value opens a nested table
        parent_indent, parent_key = string.len(indent), tag
        config[tag] = {}
      else
        -- check whether value is enclosed by brackets, i.e. an array
        if value:find('^%[(.-)%]$') then
          local arr = {}
          for item in (value:sub(2, -2) .. ','):gmatch('(.-)' .. ',%s*') do
            table.insert(arr, item)
          end
          value = arr
        else
          -- if it is a string, remove optional enclosing quotes
          value = value:match('^["\']*(.-)["\']*$')
        end

        if string.len(indent) == parent_indent + 2 and parent_key then
          config[parent_key][tag] = value
        else
          config[tag] = value
        end
      end
    end
  end

  return config
end
195
+
196
+ -- add appropriate sec-type attribute
197
+ function sec_type_helper(s)
198
+ local map = { ['Abstract']= 'abstract',
199
+ ['Acknowledgments']= 'acknowledgements',
200
+ ['Author Summary']= 'author-summary',
201
+ ['Conclusions'] = 'conclusions',
202
+ ['Discussion'] = 'discussion',
203
+ ['Glossary'] = 'glossary',
204
+ ['Introduction'] = 'intro',
205
+ ['Materials and Methods'] = 'materials|methods',
206
+ ['Notes'] = 'notes',
207
+ ['References']= 'references',
208
+ ['Results']= 'results',
209
+ ['Supporting Information']= 'supplementary-material',
210
+ ['Supplementary Information']= 'supplementary-material' }
211
+ return map[s]
212
+ end
213
+
214
+ function section_helper(lev, s, title)
215
+ local attr = { ['sec-type'] = sec_type_helper(title) }
216
+
217
+ if attr['sec-type'] == "acknowledgements" then
218
+ table.insert(back, Ack(s, title))
219
+ elseif attr['sec-type'] == "references" then
220
+ table.insert(back, RefList(s, title))
221
+ elseif attr['sec-type'] == "notes" then
222
+ table.insert(back, Note(s, title))
223
+ elseif attr['sec-type'] == "glossary" then
224
+ table.insert(back, Glossary(s, title))
225
+ elseif attr['sec-type'] == "abstract" or attr['sec-type'] == "author-summary" then
226
+ -- discard, should be provided via metadata
227
+ elseif attr['sec-type'] == "supplementary-material" then
228
+ table.insert(sections, SupplementaryMaterial(s, title))
229
+ else
230
+ table.insert(sections, Section(lev, s, title, attr))
231
+ end
232
+
233
+ return attr
234
+ end
235
+
236
+ -- Create table with year, month, day and iso8601-formatted date
237
+ -- Input is iso8601-formatted date as string
238
+ -- Return nil if input is not a valid date
239
-- iso_date is expected in the form YYYY-MM-DD.
-- Returns the table produced by os.date('*t') with an extra `iso8601`
-- field, or nil when the input is not a string of that form or does not
-- name a real calendar day.
function date_helper(iso_date)
  if type(iso_date) ~= 'string' or string.len(iso_date) ~= 10 then return nil end

  local y, m, d = string.match(iso_date, '^(%d%d%d%d)%-(%d%d)%-(%d%d)$')
  if not y then return nil end
  y, m, d = tonumber(y), tonumber(m), tonumber(d)

  -- os.time may raise an error (or return nil) for values it cannot represent
  local ok, time = pcall(os.time, { year = y, month = m, day = d })
  if not ok or not time then return nil end

  local date = os.date('*t', time)
  -- os.time normalises impossible dates (Feb 30 becomes Mar 1 or 2), so a
  -- date that does not round-trip was not a valid date to begin with
  if date.year ~= y or date.month ~= m or date.day ~= d then return nil end

  date.iso8601 = string.format('%04d-%02d-%02d', date.year, date.month, date.day)
  return date
end
248
+
249
+ -- Create affiliation table, linked to authors via aff-id
250
-- tbl is the metadata table; tbl.author is read as an array of author tables.
-- Every distinct author.affiliation gets a running id, which is stored on
-- the author as 'aff-id'. tbl.aff is replaced by an array of
-- { id = ..., name = ... } entries in order of first appearance.
-- Returns tbl.
function affiliation_helper(tbl)
  local ids = {}   -- affiliation -> id

  tbl.aff = {}
  -- a document without authors simply ends up with an empty tbl.aff
  for _, author in ipairs(tbl.author or {}) do
    local affiliation = author.affiliation
    if affiliation then
      if not ids[affiliation] then
        local id = #tbl.aff + 1
        ids[affiliation] = id
        -- appended in id order, so the result does not depend on the
        -- iteration order of pairs()
        tbl.aff[id] = { id = id, name = affiliation }
      end
      author['aff-id'] = ids[affiliation]
    end
  end

  return tbl
end
272
+
273
+ -- Create corresponding author table, linked to authors via cor-id
274
-- tbl is the metadata table; tbl.author is read as an array of author tables.
-- Every author that has both `corresp` and `email` set gets a running
-- 'cor-id'. tbl.corresp is replaced by an array of { id = ..., email = ... }
-- entries in author order.
-- Returns tbl.
function corresp_helper(tbl)
  tbl.corresp = {}
  -- a document without authors simply ends up with an empty tbl.corresp
  for _, author in ipairs(tbl.author or {}) do
    if author.corresp and author.email then
      local id = #tbl.corresp + 1
      tbl.corresp[id] = { id = id, email = author.email }
      author['cor-id'] = id
    end
  end

  return tbl
end
294
+
295
+ -- temporary fix
296
+ function fix_citeproc(s)
297
+ s = s:gsub('</surname>, ', '</surname>')
298
+ s = s:gsub('</name></name><name>','</name>')
299
+ return s
300
+ end
301
+
302
+ -- Convert pandoc alignment to something HTML can use.
303
+ -- align is AlignLeft, AlignRight, AlignCenter, or AlignDefault.
304
+ function html_align(align)
305
+ local map = { ['AlignRight']= 'right',
306
+ ['AlignCenter']= 'center' }
307
+ return map[align] or 'left'
308
+ end
309
+
310
+ -- Blocksep is used to separate block elements.
311
+ function Blocksep()
312
+ return "\n"
313
+ end
314
+
315
+ -- The functions that follow render corresponding pandoc elements.
316
+ -- s is always a string, attr is always a table of attributes, and
317
+ -- items is always an array of strings (the items in a list).
318
+ -- Comments indicate the types of other variables.
319
+ -- Defined at https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/Writers/Custom.hs
320
+
321
+ -- block elements
322
+
323
+ function Plain(s)
324
+ return s
325
+ end
326
+
327
+ function Para(s)
328
+ return xml('p', s)
329
+ end
330
+
331
+ function RawBlock(s)
332
+ return xml('preformat', s)
333
+ end
334
+
335
+ -- JATS restricts use to inside table cells (<td> and <th>)
336
+ function HorizontalRule()
337
+ return '<hr/>'
338
+ end
339
+
340
+ -- lev is an integer, the header level.
341
+ -- we can't use closing tags, as we don't know the end of the section
342
+ function Header(lev, s, attr)
343
+ attr = attr or {}
344
+ attr['lev'] = '' .. lev
345
+ return '</sec>\n<sec' .. attributes(attr) .. '>\n' .. xml('title', s)
346
+ end
347
+
348
+ function Note(s)
349
+ return s
350
+ end
351
+
352
+ function CodeBlock(s, attr)
353
+ -- If code block has class 'dot', pipe the contents through dot
354
+ -- and base64, and include the base64-encoded png as a data: URL.
355
+ if attr.class and string.match(' ' .. attr.class .. ' ',' dot ') then
356
+ local png = pipe("base64", pipe("dot -Tpng", s))
357
+ return '<img src="data:image/png;base64,' .. png .. '"/>'
358
+ -- otherwise treat as code (one could pipe through a highlighter)
359
+ else
360
+ return "<pre><code" .. attributes(attr) .. ">" .. escape(s) ..
361
+ "</code></pre>"
362
+ end
363
+ end
364
+
365
-- Wrap the rendered block quote in a <boxed-text> element.
function BlockQuote(s)
  -- without `return` the element was built and thrown away, so the
  -- function yielded nil
  return xml('boxed-text', s)
end
368
+
369
+ -- Caption is a string, aligns is an array of strings,
370
+ -- widths is an array of floats, headers is an array of
371
+ -- strings, rows is an array of arrays of strings.
372
-- Render a table as <table-wrap> with an optional <caption>, optional
-- <col> widths, one header row (skipped when all header cells are empty)
-- and one <tr> per row. Returns the markup as a single string.
function Table(caption, aligns, widths, headers, rows)
  local buffer = {}
  local function add(s)
    table.insert(buffer, s)
  end

  -- a leading <p>...</p> wrapper is dropped from cell content
  local function strip_p(cell)
    return (cell:gsub("^<p>(.-)</p>", "%1"))
  end

  add('<table-wrap>')
  if caption ~= '' then
    -- if caption begins with <bold> text, make it the <title>. The rest of
    -- the caption is wrapped in a paragraph, which is closed again here.
    caption = string.gsub('<p>' .. caption, "^<p><bold>(.-)</bold>%s", "<title>%1</title>\n<p>") .. '</p>'
    add(xml('caption', caption))
  end
  add("<table>")
  if widths and widths[1] ~= 0 then
    for _, w in ipairs(widths) do
      -- %d rejects non-integral floats on Lua 5.3+, so truncate explicitly
      add('<col width="' .. string.format("%d%%", math.floor(w * 100)) .. '" />')
    end
  end

  local header_row = {}
  local empty_header = true
  for i, h in ipairs(headers) do
    h = strip_p(h)
    table.insert(header_row, '<th align="' .. html_align(aligns[i]) .. '">' .. h .. '</th>')
    empty_header = empty_header and h == ""
  end
  if not empty_header then
    add('<tr>')
    for _, th in ipairs(header_row) do
      add(th)
    end
    add('</tr>')
  end

  for _, row in ipairs(rows) do
    add('<tr>')
    for i, c in ipairs(row) do
      add('<td align="' .. html_align(aligns[i]) .. '">' .. strip_p(c) .. '</td>')
    end
    add('</tr>')
  end
  add('</table>\n</table-wrap>')
  return table.concat(buffer, '\n')
end
421
+
422
+ function BulletList(items)
423
+ local attr = { ['list-type'] = 'bullet' }
424
+ return List(items, attr)
425
+ end
426
+
427
+ function OrderedList(items)
428
+ local attr = { ['list-type'] = 'order' }
429
+ return List(items, attr)
430
+ end
431
+
432
+ function List(items, attr)
433
+ local buffer = {}
434
+ for _, item in pairs(items) do
435
+ table.insert(buffer, xml('list-item', item))
436
+ end
437
+ return xml('list', '\n' .. table.concat(buffer, '\n') .. '\n', attr)
438
+ end
439
+
440
+ -- Revisit association list StackValue instance.
441
+ -- items is a table of tables
442
+ function DefinitionList(items)
443
+ local buffer = {}
444
+ for _,item in pairs(items) do
445
+ for k, v in pairs(item) do
446
+ local term = xml('term', k)
447
+ local def = xml('def', table.concat(v,'</def><def>'))
448
+ table.insert(buffer, xml('def-item', term .. def))
449
+ end
450
+ end
451
+ return xml('def-list', '\n' .. table.concat(buffer, '\n') .. '\n')
452
+ end
453
+
454
+ function Div(s, attr)
455
+ return s
456
+ end
457
+
458
+ -- custom block elements for JATS
459
+
460
+ -- section is generated after header to allow reordering
461
-- lev is the header level, s the rendered section content, title the
-- section title and attr a table whose 'sec-type' entry is copied over.
-- Appends an entry to `headers` and returns the <sec> element, whose id
-- is 'sec-' followed by the dotted section counters (sec-1, sec-1.1, ...).
function Section(lev, s, title, attr)
  attr = attr or {}

  -- Continue from the counters of the previous section, but only down to
  -- this level: deeper counters restart. Work on a copy so that every
  -- entry in `headers` keeps the counters it was created with.
  local last = headers[#headers]
  local h = {}
  if last then
    for i = 1, lev do
      h[i] = last.h[i]
    end
  end
  -- a section that skips levels has no parent counters yet; use 0 so that
  -- table.concat does not hit a hole
  for i = 1, lev - 1 do
    h[i] = h[i] or 0
  end
  h[lev] = (h[lev] or 0) + 1

  local header = { ['h'] = h,
                   ['title'] = title,
                   ['id'] = 'sec-' .. table.concat(h, '.'),
                   ['sec-type'] = attr['sec-type'] }

  table.insert(headers, header)

  attr = { ['id'] = header['id'], ['sec-type'] = header['sec-type'] }
  -- an empty title is rendered as an empty element (<title/>)
  title = xml('title', title ~= '' and title or nil)
  return xml('sec', '\n' .. title .. s, attr)
end
480
+
481
+ function SupplementaryMaterial(s, title, attr)
482
+ attr = {}
483
+ title = xml('title', title)
484
+ local caption = xml('caption', title .. s)
485
+ return xml('supplementary-material', '\n' .. caption .. '\n', attr)
486
+ end
487
+
488
+ function Ack(s, title)
489
+ title = title and '\n' .. xml('title', title) or ''
490
+ return xml('ack', title .. s)
491
+ end
492
+
493
+ function Glossary(s, title, attr)
494
+ title = xml('title', title)
495
+ return xml('glossary', title .. s, attr)
496
+ end
497
+
498
-- s is the rendered reference section, title its heading.
-- Rewrites numeric ref ids to the form rNNN, collects every <ref> element
-- via Ref() and returns a <ref-list> element, or '' when no reference
-- was collected.
function RefList(s, title)
  s = fix_citeproc(s)

  -- format ids; attributes() already starts its output with a space
  s = string.gsub(s, '<ref id="(%d+)">', function (r)
    local ref_attr = { ['id'] = string.format('r%03d', tonumber(r)) }
    return '<ref' .. attributes(ref_attr) .. '>'
  end)

  for ref in string.gmatch(s, '(<ref.-</ref>)') do
    Ref(ref)
  end

  if #references > 0 then
    title = xml('title', title)
    -- no attribute table is passed: this function has no `attr` of its
    -- own, and the name used to resolve to whatever global of that name
    -- another function had left behind
    return xml('ref-list', title .. table.concat(references, '\n'))
  else
    return ''
  end
end
518
+
519
+ function Ref(s)
520
+ table.insert(references, s)
521
+ return #references
522
+ end
523
+
524
+ -- inline elements
525
+
526
+ function Str(s)
527
+ return s
528
+ end
529
+
530
+ function Space()
531
+ return ' '
532
+ end
533
+
534
+ function Emph(s)
535
+ return xml('italic', s)
536
+ end
537
+
538
+ function Strong(s)
539
+ return xml('bold', s)
540
+ end
541
+
542
+ function Strikeout(s)
543
+ return xml('strike', s)
544
+ end
545
+
546
+ function Superscript(s)
547
+ return xml('sup', s)
548
+ end
549
+
550
+ function Subscript(s)
551
+ return xml('sub', s)
552
+ end
553
+
554
+ function SmallCaps(s)
555
+ return xml('sc', s)
556
+ end
557
+
558
+ function SingleQuoted(s)
559
+ return "'" .. s .. "'"
560
+ end
561
+
562
+ function DoubleQuoted(s)
563
+ return '"' .. s .. '"'
564
+ end
565
+
566
+ -- format in-text citation
567
+ function Cite(s)
568
+ local ids = {}
569
+ for id in string.gmatch(s, '(%d+)') do
570
+ id = tonumber(id)
571
+ -- workaround to discard year mistakenly taken for key
572
+ if id and id < 1000 then
573
+ local attr = { ['ref-type'] = 'bibr',
574
+ ['rid'] = string.format("r%03d", id) }
575
+ table.insert(ids, xml('xref', '[' .. id .. ']', attr))
576
+ end
577
+ end
578
+ if #ids > 0 then
579
+ return table.concat(ids)
580
+ else
581
+ -- return original key for backwards compatibility
582
+ return s
583
+ end
584
+ end
585
+
586
+ function Code(s, attr)
587
+ return xml('preformat', s, attr)
588
+ end
589
+
590
+ function DisplayMath(s)
591
+ return xml('disp-formula', s)
592
+ end
593
+
594
+ function InlineMath(s)
595
+ return xml('inline-formula', s)
596
+ end
597
+
598
+ function RawInline(s)
599
+ return xml('preformat', s)
600
+ end
601
+
602
+ function LineBreak()
603
+ return ' '
604
+ end
605
+
606
-- s is the link text, src the target and title the link title.
-- Returns an <ext-link> element, or s unchanged when the text or the
-- target is empty.
function Link(s, src, title)
  if src ~= '' and s ~= '' then
    -- `local` keeps this table from becoming a global. The values are
    -- passed unescaped because attributes() (called by xml()) escapes every
    -- attribute value itself; escaping here as well turned '&' into
    -- '&amp;amp;'.
    local attr = { ['ext-link-type'] = 'uri',
                   ['xlink:href'] = src,
                   ['xlink:title'] = title,
                   ['xlink:type'] = 'simple' }

    -- NOTE(review): s is escaped here although no other inline writer in
    -- this file escapes its text - confirm whether s can contain markup.
    return xml('ext-link', escape(s), attr)
  else
    return s
  end
end
618
+
619
-- Build a <fig> element from a <caption> holding s and the graphic made
-- by Image(nil, src, title). The figure id is gNNN, numbered after the
-- figures collected so far; the element is appended to `figures` and
-- returned.
function CaptionedImage(s, src, title)
  -- if title begins with <bold> text, make it the <title>
  -- NOTE(review): the callback returns nothing, so gsub keeps the match and
  -- `title` comes out unchanged - the <title> element is built and dropped.
  -- Confirm the intended output (and the argument order pandoc uses for
  -- CaptionedImage) before changing this.
  title = string.gsub(title, "^<bold>(.-)</bold>%s", function(t) xml('title', t) end)

  local fig_attr = { ['id'] = string.format("g%03d", #figures + 1) }
  local fig = xml('fig', xml('caption', s) .. Image(nil, src, title), fig_attr)

  table.insert(figures, fig)
  return fig
end
630
+
631
-- s is the element content (nil gives an empty element), src the image
-- location and title the image title.
-- Returns a <graphic> element.
function Image(s, src, title)
  -- values are passed unescaped: attributes() (called by xml()) escapes
  -- every attribute value, so escaping here as well produced '&amp;amp;'
  local attr = { ['mimetype'] = 'image',
                 ['xlink:href'] = src,
                 ['xlink:title'] = title,
                 ['xlink:type'] = 'simple' }

  return xml('graphic', s, attr)
end
639
+
640
+ -- handle bold and italic
641
+ function Span(s, attr)
642
+ if attr.style == "font-weight:bold" then
643
+ return Strong(s)
644
+ elseif attr.style == "font-style:italic" then
645
+ return Emph(s)
646
+ elseif attr.style == "font-variant: small-caps" then
647
+ return SmallCaps(s)
648
+ else
649
+ return s
650
+ end
651
+ end
652
+
653
+ -- The following code will produce runtime warnings when you haven't defined
654
+ -- all of the functions you need for the custom writer, so it's useful
655
+ -- to include when you're working on a writer.
656
+ local meta = {}
657
+ meta.__index =
658
+ function(_, key)
659
+ io.stderr:write(string.format("WARNING: Undefined function '%s'\n",key))
660
+ return function() return "" end
661
+ end
662
+ setmetatable(_G, meta)
@@ -0,0 +1,32 @@
1
module Bergamasco
  # Conversion to JATS XML by way of pandoc, using the bundled custom
  # writer (jats.lua) and citation style (jats.csl).
  module Jats

    # Convert +text+ with pandoc and return the converter's output.
    #
    # +options+ are handed to PandocRuby after the Bergamasco-specific keys
    # listed below have been removed. The :template, :to and :csl entries are
    # always set here and replace any caller-supplied values.
    #
    # Prints a message and returns nil when Errno::ENOENT is raised, i.e.
    # when pandoc is not installed.
    #
    # NOTE(review): the three paths are relative, so they presumably resolve
    # against the current working directory - confirm this is intended when
    # the gem is used from another project.
    def self.render_jats(text, options={})
      options = options.merge(template: "templates/default.jats",
                              to: "lib/bergamasco/jats.lua",
                              csl: "lib/bergamasco/jats.csl")
      # :metadata needs no special handling: it is not in the list below and
      # is therefore passed through unchanged.
      converter = PandocRuby.new(text, options.except(:skip_yaml_header,
                                                      :separator,
                                                      :sitepath,
                                                      :authorpath,
                                                      :referencespath,
                                                      :username,
                                                      :password,
                                                      :sandbox,
                                                      :prefix))
      converter.convert
    rescue Errno::ENOENT
      # if pandoc is not installed.
      puts "Pandoc is not installed"
    end

    # Render the file at +filepath+ and write the result next to it, with the
    # ".html.md" extension replaced by ".xml". Returns the path written to.
    #
    # File.read / File.write are used rather than IO.read / IO.write, which
    # treat a path starting with "|" as a command to run.
    #
    # NOTE(review): when render_jats returns nil (pandoc missing) an empty
    # file is written - confirm whether callers rely on that.
    def self.write_jats(filepath, options={})
      file = File.read(filepath)
      xml_path = File.join(File.dirname(filepath), File.basename(filepath, ".html.md")) + ".xml"
      xml = render_jats(file, options)
      File.write(xml_path, xml)
      xml_path
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Bergamasco
2
- VERSION = "0.2.14"
2
+ VERSION = "0.3"
3
3
  end
@@ -15,9 +15,9 @@ Cool URIs are, of course, a fundamental principle behind DOIs, with the two impo
15
15
 
16
16
  All DOIs, expressed as HTTP URI, are therefore cool URIs. So what is a cool DOI? And, furthermore, how to create and use them? To understand what a cool DOI is, we have to explain the three parts that make up a DOI:
17
17
 
18
- ![](/images/2016/12/doi-parts.png)
18
+ ![](images/2016/12/doi-parts.png)
19
19
 
20
- ### Proxy
20
+ ## Proxy
21
21
 
22
22
  The proxy is not part of the DOI specification, but almost all scholarly DOIs that users encounter today will be expressed as HTTP URLs. DataCite recommends that all DOIs are displayed as permanent URLs, consistent with the recommendations of other DOI registration agencies, e.g. the [Crossref DOI display guidelines](http://www.crossref.org/02publishers/doi_display_guidelines.html). When the DOI system was originally designed, it was thought that the DOI protocol would become widely used, but that clearly has not happened and displaying DOIs as **doi:10.5281/ZENODO.31780** is therefore not recommended.
23
23
 
@@ -30,13 +30,13 @@ Ed Pentz from Crossref makes the case for HTTPS in a [September blog post](http:
30
30
 
31
31
  What many users don’t know is that doi.org is not the only proxy server for DOIs. DOIs use the handle system and any handle server will resolve a DOI, just as doi.org will resolve any handle. This means that [https://hdl.handle.net/10.5281/ZENODO.31780](https://hdl.handle.net/10.5281/ZENODO.31780) will resolve to the landing page for that DOI and that [http://doi.org/10273/BGRB5054RX05201](http://doi.org/10273/BGRB5054RX05201) is a handle (for an [IGSN](http://www.igsn.org/)) and not a DOI.
32
32
 
33
- ### Prefix
33
+ ## Prefix
34
34
 
35
35
  The DOI prefix is used as a namespace so that DOIs are globally unique without requiring global coordination for every new identifier. Prefixes in the handle system and therefore for DOIs are numbers without any semantic meaning. One lesson learned with persistent identifiers is that adding meaning to the identifier (e.g. by using a prefix with the name of the data repository) is always dangerous, because – despite best intentions – all names can change over time.
36
36
 
37
37
  Since the DOI prefix is a namespace to keep DOIs globally unique, there is usually no need for multiple prefixes for one organization managing DOI assignment. The tricky part is that these responsibilities can change, e.g. when an organization manages multiple repositories and one of them is migrated to another organization. It therefore makes sense to assign one prefix per list of resources that always stays together, e.g. one repository. It is possible that one prefix is managed by multiple organizations (as long as they use the same DOI registration agency), but that makes DOI management more complex.
38
38
 
39
- ### Suffix
39
+ ## Suffix
40
40
 
41
41
  The suffix for a DOI can be (almost) any string, which is both a feature and a curse. It is a feature because it gives maximal flexibility, for example when migrating existing identifiers to the DOI system. And it is a curse because it does not always work well in the web context, as the list of characters allowed in a URL is limited. A good example of this are SICIs ([Serial Item and Contribution Identifier](https://en.wikipedia.org/wiki/Serial_Item_and_Contribution_Identifier)), which were defined in 1996 before the DOI system was implemented, and could then be migrated to DOIs. Unfortunately they can contain many characters that are problematic in a URL or make it difficult to validate the DOI, as in [https://doi.org/10.1002/(sici)1099-1409(199908/10)3:6/7<672::aid-jpp192>3.0.co;2-8](https://doi.org/10.1002/(sici)1099-1409(199908/10)3:6/7<672::aid-jpp192>3.0.co;2-8). A Crossref [blog post](http://blog.crossref.org/2015/08/doi-regular-expressions.html) by Andrew Gilmartin gives a good overview of the characters found in DOIs and suggests the following regular expression to check for valid DOIs:
42
42
 
@@ -53,7 +53,7 @@ Semantic information might also lead users to expect certain functionalities. A
53
53
 
54
54
  Another issue to keep in mind when assigning suffixes is that DOIs – in contrast to HTTP URIs – are case-insensitive, [https://doi.org/10.5281/ZENODO.31780](https://doi.org/10.5281/ZENODO.31780) and [https://doi.org/10.5281/zenodo.31780](https://doi.org/10.5281/zenodo.31780) are the same DOI. All DOIs are [converted to upper case](https://www.doi.org/doi_handbook/2_Numbering.html#2.4) upon registration and DOI resolution, but DOIs are not consistently displayed in such a way.
55
55
 
56
- ### Generating cool DOIs
56
+ ## Generating cool DOIs
57
57
 
58
58
  With all that, what should the ideal DOI look like? Its suffix should be:
59
59
 
@@ -93,6 +93,6 @@ This can be used to quickly verify a DOI, e.g. in a web form or API. The Ruby ba
93
93
 
94
94
  To answer the question raised at the beginning: a cool DOI is a DOI expressed as HTTPS URI using the doi.org proxy and using a base32-encoded suffix, for example **https://doi.org/10.5555/KVTD-VPWM**. This DOI works well in a web environment, is human readable, easy to parse and detect (e.g. in text mining), and can be generated using an algorithm that is well understood and supported.
95
95
 
96
- ![](/images/2016/12/cool-dois.png)
96
+ ![](images/2016/12/cool-dois.png)
97
97
 
98
- ### References
98
+ ## References
@@ -0,0 +1,98 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN"
3
+ "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
4
+ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="editorial" dtd-version="1.1">
5
+ <front>
6
+ <article-meta>
7
+ <article-id pub-id-type="doi">10.23725/0000-03VC</article-id>
8
+ <article-categories>
9
+ <subj-group subj-group-type="categories">
10
+ <subject>doi</subject>
11
+ <subject>featured</subject>
12
+ </subj-group>
13
+ </article-categories>
14
+ <title-group>
15
+ <article-title>Cool DOI's</article-title>
16
+ </title-group>
17
+ <contrib-group>
18
+ <contrib contrib-type="author">
19
+ <name>
20
+ <string-name>mfenner</string-name>
21
+ </name>
22
+ </contrib>
23
+ </contrib-group>
24
+ <pub-date pub-type="epub" iso-8601-date="2016-12-15">
25
+ <string-date>2016-12-15</string-date>
26
+ </pub-date>
27
+ </article-meta>
28
+ </front>
29
+ <body>
30
+ <sec id="sec-1">
31
+ <title/>
32
+ <p>In 1998 Tim Berners-Lee coined the term cool URIs <xref ref-type="bibr" rid="r001">[1]</xref>, that is URIs that don’t change. We know that URLs referenced in the scholarly literature are often not cool, leading to link rot <xref ref-type="bibr" rid="r002">[2]</xref> and making it hard or impossible to find the referenced resource.READMORE</p>
33
+ <p>Cool URIs are, of course, a fundamental principle behind DOIs, with the two important concepts <ext-link ext-link-type="uri" xlink:href="https://www.doi.org/doi_handbook/3_Resolution.html" xlink:type="simple"><italic>resolution</italic></ext-link> (it is very hard to maintain a URL directly pointing at a resource) and <ext-link ext-link-type="uri" xlink:href="https://www.doi.org/doi_handbook/6_Policies.html" xlink:type="simple"><italic>policies</italic></ext-link> (that all DOI registration agencies and organizations minting DOIs agree to maintain the redirection). The third essential element for DOIs, their <ext-link ext-link-type="uri" xlink:href="https://www.doi.org/doi_handbook/4_Data_Model.html" xlink:type="simple"><italic>data model</italic></ext-link>, is not directly about persistent linking, but about the discoverability of the linked resources via standard metadata in a central index.</p>
34
+ <p>All DOIs, expressed as HTTP URI, are therefore cool URIs. So what is a cool DOI? And, furthermore, how to create and use them? To understand what a cool DOI is, we have to explain the three parts that make up a DOI:</p>
35
+ <fig id="g001"><caption>images/2016/12/doi-parts.png</caption><graphic mimetype="image" xlink:href="fig:" xlink:type="simple"/></fig>
36
+ </sec>
37
+ <sec id="sec-1.1">
38
+ <title>Proxy</title>
39
+ <p>The proxy is not part of the DOI specification, but almost all scholarly DOIs that users encounter today will be expressed as HTTP URLs. DataCite recommends that all DOIs are displayed as permanent URLs, consistent with the recommendations of other DOI registration agencies, e.g. the <ext-link ext-link-type="uri" xlink:href="http://www.crossref.org/02publishers/doi_display_guidelines.html" xlink:type="simple">Crossref DOI display guidelines</ext-link>. When the DOI system was originally designed, it was thought that the DOI protocol would become widely used, but that clearly has not happened and displaying DOIs as <bold>doi:10.5281/ZENODO.31780</bold> is therefore not recommended.</p>
40
+ <p>The DOI proxy enables the functionality of expressing DOIs as HTTP URIs. Users should also be aware of these two recommendations:</p>
41
+ <list list-type="bullet">
42
+ <list-item>Use <ext-link ext-link-type="uri" xlink:href="https://www.doi.org/doi_proxy/proxy_policies.html" xlink:type="simple">doi.org</ext-link> instead of dx.doi.org as DNS name</list-item>
43
+ <list-item>Use the HTTPS protocol instead of HTTP protocol</list-item>
44
+ </list>
45
+ <p>Ed Pentz from Crossref makes the case for HTTPS in a <ext-link ext-link-type="uri" xlink:href="http://blog.crossref.org/2016/09/new-crossref-doi-display-guidelines.html" xlink:type="simple">September blog post</ext-link>. The web, and therefore also the scholarly web, is moving to HTTPS as the default. It is important that the DOI proxy redirects to HTTPS URLs, and it will take some time until all DataCite data centers use HTTPS for the landing pages their DOIs redirect to.</p>
46
+ <p>What many users don’t know is that doi.org is not the only proxy server for DOIs. DOIs use the handle system and any handle server will resolve a DOI, just as doi.org will resolve any handle. This means that <ext-link ext-link-type="uri" xlink:href="https://hdl.handle.net/10.5281/ZENODO.31780" xlink:type="simple">https://hdl.handle.net/10.5281/ZENODO.31780</ext-link> will resolve to the landing page for that DOI and that <ext-link ext-link-type="uri" xlink:href="http://doi.org/10273/BGRB5054RX05201" xlink:type="simple">http://doi.org/10273/BGRB5054RX05201</ext-link> is a handle (for an <ext-link ext-link-type="uri" xlink:href="http://www.igsn.org/" xlink:type="simple">IGSN</ext-link>) and not a DOI.</p>
47
+ </sec>
48
+ <sec id="sec-1.2">
49
+ <title>Prefix</title>
50
+ <p>The DOI prefix is used as a namespace so that DOIs are globally unique without requiring global coordination for every new identifier. Prefixes in the handle system and therefore for DOIs are numbers without any semantic meaning. One lesson learned with persistent identifiers is that adding meaning to the identifier (e.g. by using a prefix with the name of the data repository) is always dangerous, because – despite best intentions – all names can change over time.</p>
51
+ <p>Since the DOI prefix is a namespace to keep DOIs globally unique, there is usually no need for multiple prefixes for one organization managing DOI assignment. The tricky part is that these responsibilities can change, e.g. when an organization manages multiple repositories and one of them is migrated to another organization. It therefore makes sense to assign one prefix per list of resources that always stays together, e.g. one repository. It is possible that one prefix is managed by multiple organizations (as long as they use the same DOI registration agency), but that makes DOI management more complex.</p>
52
+ </sec>
53
+ <sec id="sec-1.3">
54
+ <title>Suffix</title>
55
+ <p>The suffix for a DOI can be (almost) any string. Which is both a feature and a curse. It is a feature because it gives maximal flexibility, for example when migrating existing identifiers to the DOI system. And it is a curse because it does not always work well in the web context, as the list of characters allowed in a URL is limited. A good example of this are SICIs (<ext-link ext-link-type="uri" xlink:href="https://en.wikipedia.org/wiki/Serial_Item_and_Contribution_Identifier" xlink:type="simple">Serial Item and Contribution Identifier</ext-link>), they were defined in 1996 before the DOI system was implemented, and could then be migrated to DOIs. Unfortunately they can contain many characters that are problematic in a URL or make it difficult to validate the DOI, as in <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/(sici)1099-1409(199908/10)3:6/7%3C672::aid-jpp192%3E3.0.co;2-8" xlink:type="simple">https://doi.org/10.1002/(sici)1099-1409(199908/10)3:6/7&lt;672::aid-jpp192&gt;3.0.co;2-8</ext-link>. A Crossref <ext-link ext-link-type="uri" xlink:href="http://blog.crossref.org/2015/08/doi-regular-expressions.html" xlink:type="simple">blog post</ext-link> by Andrew Gilmartin gives a good overview about the characters found in DOIs and suggests the following regular expression to check for valid DOIs:</p>
56
+ <pre><code>/^10.\d{4,9}/[-._;()/:A-Z0-9]+$/i</code></pre>
57
+ <p>SICIs demonstrate two other pitfalls:</p>
58
+ <list list-type="bullet">
59
+ <list-item>they contain semantic information (ISSN, volume, number, etc.) that may change over time, and</list-item>
60
+ <list-item>they are long, difficult to transcribe, with characters not allowed in URLs, and not very human-readable.</list-item>
61
+ </list>
62
+ <p>Semantic information might also lead users to expect certain functionalities. A common pattern that we see at DataCite is to include information about the version or parent in the suffix, e.g. <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.6084/M9.FIGSHARE.3501629.V1" xlink:type="simple">https://doi.org/10.6084/M9.FIGSHARE.3501629.V1</ext-link> or <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5061/DRYAD.0SN63/7" xlink:type="simple">https://doi.org/10.5061/DRYAD.0SN63/7</ext-link>. While the decision on what to put into the suffix is up to each data center, we should make sure users don't think that these are functionalities of the DOI system (e.g. that adding <bold>.V2</bold> to any DOI name will resolve to version 2 of that resource).</p>
63
+ <p>Another issue to keep in mind when assigning suffixes is that DOIs – in contrast to HTTP URIs – are case-insensitive, <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/ZENODO.31780" xlink:type="simple">https://doi.org/10.5281/ZENODO.31780</ext-link> and <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.31780" xlink:type="simple">https://doi.org/10.5281/zenodo.31780</ext-link> are the same DOI. All DOIs are <ext-link ext-link-type="uri" xlink:href="https://www.doi.org/doi_handbook/2_Numbering.html#2.4" xlink:type="simple">converted to upper case</ext-link> upon registration and DOI resolution, but DOIs are not consistently displayed in such a way.</p>
64
+ </sec>
65
+ <sec id="sec-1.4">
66
+ <title>Generating cool DOIs</title>
67
+ <p>With all that, what should the ideal DOI look like? Its suffix should be:</p>
68
+ <list list-type="bullet">
69
+ <list-item>opaque without semantic information</list-item>
70
+ <list-item>work well in a web environment, avoiding characters problematic in URLs</list-item>
71
+ <list-item>short and human-readable</list-item>
72
+ <list-item>Resistant to transcription errors</list-item>
73
+ <list-item>easy to generate</list-item>
74
+ </list>
75
+ <p>On Tuesday DataCite released a tool that helps generating such a suffix, an open source command line tool called <ext-link ext-link-type="uri" xlink:href="https://github.com/datacite/cirneco" xlink:type="simple">cirneco</ext-link> (a lot of our open source software uses Italian dog breed names). Cirneco is a Ruby gem that can be installed via</p>
76
+ <pre><code>gem install cirneco</code></pre>
77
+ <p>Cirneco uses base32 encoding, as <ext-link ext-link-type="uri" xlink:href="http://www.crockford.com/wrmg/base32.html" xlink:type="simple">described</ext-link> by Douglas Crockford. The encoding starts with a randomly generated number to guarantee uniqueness of the identifier, and then encodes the number into a string that uses all numbers and uppercase letters. It avoids the letters I, O and L as they can be confused with the digits 1 and 0, using 32 characters (and 5 checksum characters) in total. The last character is a checksum. The resulting string from cirneco always has a length of 8 characters, in groups of 4 separated by a hyphen to help with readability. The advantage of base32 encoding over using only numbers (as for example ORCID is doing) is that the resulting string becomes much more compact, the available 7 characters (plus one for the checksum) can encode 34,359,738,367 strings, compared to 10 million when only using numbers. This number is large enough that the resulting suffix will not only be unique for a given prefix, but also unique for all DOIs (there is a very small chance to get the same random number twice, but this will be rejected when trying to register the DOI).</p>
78
+ <p>Another common way to generate random strings would have been universally unique identifiers (<ext-link ext-link-type="uri" xlink:href="https://en.wikipedia.org/wiki/Universally_unique_identifier" xlink:type="simple">UUID</ext-link>), but they are long and not very human-readable, e.g. <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.4233/UUID:6D192FE2-DE18-4556-873A-D3CD56AB96A6" xlink:type="simple">https://doi.org/10.4233/UUID:6D192FE2-DE18-4556-873A-D3CD56AB96A6</ext-link>.</p>
79
+ <p>An example DOI generated by cirneco would be</p>
80
+ <pre><code>cirneco doi generate --prefix 10.5555
81
+ 10.5555/KVTD-VPWM</code></pre>
82
+ <p>The generated DOI is short enough that it should work well in places where space is limited, providing an alternative to the <ext-link ext-link-type="uri" xlink:href="http://shortdoi.org/" xlink:type="simple">ShortDOI</ext-link> service which shortens existing DOIs, but does this by adding another layer on top of the DOI proxy.</p>
83
+ <p>Another cirneco command checks that this is a valid base32 string using the checksum</p>
84
+ <pre><code>cirneco doi check 10.5555/KVTD-VPWM
85
+ Checksum for 10.5555/KVTD-VPWM is valid</code></pre>
86
+ <p>This can be used to quickly verify a DOI, e.g. in a web form or API. The Ruby base32 encoding library used by cirneco is open source (<ext-link ext-link-type="uri" xlink:href="https://github.com/datacite/base32" xlink:type="simple">https://github.com/datacite/base32</ext-link>. I added the checksum to the existing library), and implementations of the Crockford base32 encoding pattern are available in many other languages, including <ext-link ext-link-type="uri" xlink:href="https://github.com/jbittel/base32-crockford" xlink:type="simple">Python</ext-link>, <ext-link ext-link-type="uri" xlink:href="https://github.com/dflydev/dflydev-base32-crockford" xlink:type="simple">PHP</ext-link>, <ext-link ext-link-type="uri" xlink:href="https://www.npmjs.com/package/base32-crockford" xlink:type="simple">Javascript</ext-link>, <ext-link ext-link-type="uri" xlink:href="http://stackoverflow.com/questions/22385467/crockford-base32-encoding-for-large-number-java-implementation" xlink:type="simple">Java</ext-link>, <ext-link ext-link-type="uri" xlink:href="https://github.com/richardlehane/crock32" xlink:type="simple">Go</ext-link> and <ext-link ext-link-type="uri" xlink:href="https://crockfordbase32.codeplex.com/" xlink:type="simple">.NET</ext-link>.</p>
87
+ <p>To answer the question raised at the beginning: a cool DOI is a DOI expressed as HTTPS URI using the doi.org proxy and using a base32-encoded suffix, for example <bold>https://doi.org/10.5555/KVTD-VPWM</bold>. This DOI works well in a web environment, is human readable, easy to parse and detect (e.g. in text mining), and can be generated using an algorithm that is well understood and supported.</p>
88
+ <fig id="g002"><caption>images/2016/12/cool-dois.png</caption><graphic mimetype="image" xlink:href="fig:" xlink:type="simple"/></fig>
89
+ </sec>
90
+ </body>
91
+ <back>
92
+ <ref-list><title>References</title><ref id="r001"><label>1</label>
93
+ <element-citation publication-type="standard"><person-group person-group-type="author"><name><surname>Berners-Lee</surname><given-names>Tim</given-names></name></person-group><article-title>Hypertext Style: Cool URIs don’t change.</article-title><date-in-citation content-type="access-date" iso-8601-date="2016-12-14"><day>14</day><month>12</month><year>2016</year></date-in-citation><ext-link ext-link-type="uri" xlink:href="https://www.w3.org/Provider/Style/URI" xlink:type="simple">https://www.w3.org/Provider/Style/URI</ext-link></element-citation>
94
+ </ref>
95
+ <ref id="r002"><label>2</label>
96
+ <element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Klein</surname><given-names>Martin</given-names></name><name><surname>Sompel</surname><given-names>Herbert Van de</given-names></name><name><surname>Sanderson</surname><given-names>Robert</given-names></name><name><surname>Shankar</surname><given-names>Harihar</given-names></name><name><surname>Balakireva</surname><given-names>Lyudmila</given-names></name><name><surname>Zhou</surname><given-names>Ke</given-names></name><name><surname>Tobin</surname><given-names>Richard</given-names></name></person-group><article-title>Scholarly Context Not Found: One in Five Articles Suffers from Reference Rot</article-title><source>PLOS ONE</source><date iso-8601-date="2014-12"><month>12</month><year>2014</year></date><volume>9</volume><issue>12</issue><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pone.0115253" xlink:type="simple">10.1371/journal.pone.0115253</ext-link><ext-link ext-link-type="uri" xlink:href="http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0115253" xlink:type="simple">http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0115253</ext-link></element-citation>
97
+ </ref></ref-list></back>
98
+ </article>
data/spec/jats_spec.rb ADDED
@@ -0,0 +1,24 @@
1
+ require 'spec_helper'
2
+
3
+ describe Bergamasco::Jats do
4
+ subject { Bergamasco::Jats }
5
+
6
+ it 'should convert to jats' do
7
+ filepath = fixture_path + 'cool-dois.html.md'
8
+ file = IO.read(filepath)
9
+ xml = subject.render_jats(file, skip_yaml_header: true, csl: 'spec/fixtures/apa.csl', bibliography: 'spec/fixtures/references.yaml')
10
+ doc = Nokogiri::XML(xml)
11
+ article_id = doc.at_xpath("//article-id")
12
+ expect(article_id.text).to eq("10.23725/0000-03VC")
13
+ expect(article_id.values.first).to eq("doi")
14
+ end
15
+
16
+ it 'should write jats xml' do
17
+ filepath = fixture_path + 'cool-dois.html.md'
18
+ xml_path = subject.write_jats(filepath, skip_yaml_header: true, csl: 'spec/fixtures/apa.csl', bibliography: 'spec/fixtures/references.yaml')
19
+ doc = File.open(xml_path) { |f| Nokogiri::XML(f) }
20
+ article_id = doc.at_xpath("//article-id")
21
+ expect(article_id.text).to eq("10.23725/0000-03VC")
22
+ expect(article_id.values.first).to eq("doi")
23
+ end
24
+ end
@@ -0,0 +1,82 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN"
3
+ "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
4
+ <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="editorial" dtd-version="1.1">
5
+ <front>
6
+ $if(publisher)$
7
+ <journal-meta>
8
+ <publisher>
9
+ <publisher-name>$publisher$</publisher-name>
10
+ </publisher>
11
+ </journal-meta>
12
+ $endif$
13
+ <article-meta>
14
+ $if(doi)$
15
+ <article-id pub-id-type="doi">$doi$</article-id>
16
+ $endif$
17
+ $if(tags)$
18
+ <article-categories>
19
+ <subj-group subj-group-type="categories">
20
+ $for(tags)$
21
+ <subject>$tags$</subject>
22
+ $endfor$
23
+ </subj-group>
24
+ </article-categories>
25
+ $endif$
26
+ $if(title)$
27
+ <title-group>
28
+ <article-title>$title$</article-title>
29
+ </title-group>
30
+ $endif$
31
+ $if(author)$
32
+ <contrib-group>
33
+ $for(author)$
34
+ <contrib contrib-type="author">
35
+ $if(author.orcid)$
36
+ <contrib-id contrib-id-type="orcid">$author.orcid$</contrib-id>
37
+ $endif$
38
+ <name>
39
+ $if(author.family_name)$
40
+ <surname>$author.family_name$</surname>
41
+ <given-names>$author.given_name$</given-names>
42
+ $else$
43
+ <string-name>$author$</string-name>
44
+ $endif$
45
+ </name>
46
+ </contrib>
47
+ $endfor$
48
+ </contrib-group>
49
+ $endif$
50
+ $if(date)$
51
+ <pub-date pub-type="epub" iso-8601-date="$date$">
52
+ $if(publication_day)$
53
+ <day>$publication_day$</day>
54
+ $endif$
55
+ $if(publication_month)$
56
+ <month>$publication_month$</month>
57
+ $endif$
58
+ $if(publication_year)$
59
+ <year>$publication_year$</year>
60
+ $else$
61
+ <string-date>$date$</string-date>
62
+ $endif$
63
+ </pub-date>
64
+ $endif$
65
+ $if(license_name)$
66
+ <permissions>
67
+ <license license-type="open-access" xlink:href="$license_url$">
68
+ <license-p>$license_name$</license-p>
69
+ </license>
70
+ </permissions>
71
+ $endif$
72
+ $if(subjects)$
73
+ <kwd-group kwd-group-type="author">
74
+ $for(subjects)$
75
+ <kwd>$subjects$</kwd>
76
+ $endfor$
77
+ </kwd-group>
78
+ $endif$
79
+ </article-meta>
80
+ </front>
81
+ $body$
82
+ </article>
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bergamasco
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.14
4
+ version: '0.3'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martin Fenner
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-12-19 00:00:00.000000000 Z
11
+ date: 2016-12-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -267,6 +267,9 @@ files:
267
267
  - Rakefile
268
268
  - bergamasco.gemspec
269
269
  - lib/bergamasco.rb
270
+ - lib/bergamasco/jats.csl
271
+ - lib/bergamasco/jats.lua
272
+ - lib/bergamasco/jats.rb
270
273
  - lib/bergamasco/markdown.rb
271
274
  - lib/bergamasco/sanitize.rb
272
275
  - lib/bergamasco/summarize.rb
@@ -276,13 +279,16 @@ files:
276
279
  - spec/fixtures/apa.csl
277
280
  - spec/fixtures/cool-dois-without-yml.md
278
281
  - spec/fixtures/cool-dois.html.md
282
+ - spec/fixtures/cool-dois.xml
279
283
  - spec/fixtures/cool-dois.yml
280
284
  - spec/fixtures/references.bib
281
285
  - spec/fixtures/references.yaml
286
+ - spec/jats_spec.rb
282
287
  - spec/markdown_spec.rb
283
288
  - spec/sanitize_spec.rb
284
289
  - spec/spec_helper.rb
285
290
  - spec/summarize_spec.rb
291
+ - templates/default.jats
286
292
  homepage: https://github.com/datacite/bergamasco
287
293
  licenses:
288
294
  - MIT