rakali 0.0.15 → 0.0.17

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cecffbfec2462ddcd3859923599fa16680ea7331
4
- data.tar.gz: c09b0a2add3793a8f02b3ec88a7413613c9bbdf6
3
+ metadata.gz: 2eb9ce467fe5995f4ccdf923e5fadc8565fa92cc
4
+ data.tar.gz: 591d43f84a478ff6aee2640e0d20b6e8495b6109
5
5
  SHA512:
6
- metadata.gz: 1768d9530d2a2de035237548852399c4b4bc1c1ceb12d0f01ec94ab43ec08096449306199f5ecd2d794053d07c790ac31bdf5f2e45769b4dc07d2930efed0b88
7
- data.tar.gz: bee23522cb306f598561cfd96ffb5a5c14027d3d42c0483df75d29d93210498069fa76a37c2c6fa6e56f916034cd368d4ce8960ae27c361f363bde0cbcb24307
6
+ metadata.gz: 612720c0b05c2890eca185903166f34272fa0225db753aac244b970a32852661275a86500180563ff866fa83d543c7ce45029c3aaf9ed01ae72749d25917ceb8
7
+ data.tar.gz: 57724aa15bed4025549c45d274d3298e9552ada75fe62fef0b783bdde5fd1b8a9fabd0e8edd083d51bb90b25cf63427a3f54d9663f9765911fae0c91bea544f4
@@ -0,0 +1,8 @@
1
+ pkg/*
2
+ *.gem
3
+ .bundle
4
+ *.html
5
+ *.epub
6
+ *.jats
7
+ spec/fixtures/*.html
8
+ spec/fixtures/*.docx
@@ -0,0 +1,6 @@
1
+ from:
2
+ folder: examples
3
+ format: md
4
+ to:
5
+ format: epub
6
+ merge: true
@@ -0,0 +1,22 @@
1
+ language: haskell
2
+
3
+ rvm:
4
+ - 1.9.3
5
+ - 2.1.2
6
+
7
+ install:
8
+ - travis_retry cabal install pandoc pandoc-citeproc
9
+ - travis_retry bundle install
10
+
11
+ script:
12
+ - bundle exec rake
13
+ - bundle exec rakali convert .rakali.yml
14
+
15
+ deploy:
16
+ provider: rubygems
17
+ api_key:
18
+ secure: Gcr3lbeTuQW0MXpO9sh2lnYN4EY9FFLaBhF9RK99JjGUYAm7HR70yWM/EhAThWtfjVhzwcjzOJ6RrGE401zVRUsye8GTMXA5d7USx1KGGmWNRG5cYTCBWhymHXv4vfSfMp3CP2FgyfcTigXu8yHh18ONpHIhbBuZNx1DliBEPgU=
19
+ gem: rakali
20
+ on:
21
+ tags: true
22
+ repo: rakali/rakali.rb
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -0,0 +1,54 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ rakali (0.0.17)
5
+ colorator (~> 0.1)
6
+ json-schema (~> 2.2)
7
+ safe_yaml (~> 1.0)
8
+ thor (~> 0.19)
9
+
10
+ GEM
11
+ remote: https://rubygems.org/
12
+ specs:
13
+ aruba (0.6.0)
14
+ childprocess (>= 0.3.6)
15
+ cucumber (>= 1.1.1)
16
+ rspec-expectations (>= 2.7.0)
17
+ builder (3.2.2)
18
+ childprocess (0.5.3)
19
+ ffi (~> 1.0, >= 1.0.11)
20
+ colorator (0.1)
21
+ cucumber (1.3.16)
22
+ builder (>= 2.1.2)
23
+ diff-lcs (>= 1.1.3)
24
+ gherkin (~> 2.12)
25
+ multi_json (>= 1.7.5, < 2.0)
26
+ multi_test (>= 0.1.1)
27
+ diff-lcs (1.2.5)
28
+ ffi (1.9.3)
29
+ gherkin (2.12.2)
30
+ multi_json (~> 1.3)
31
+ json-schema (2.2.4)
32
+ multi_json (1.10.1)
33
+ multi_test (0.1.1)
34
+ rake (0.9.6)
35
+ rspec (2.99.0)
36
+ rspec-core (~> 2.99.0)
37
+ rspec-expectations (~> 2.99.0)
38
+ rspec-mocks (~> 2.99.0)
39
+ rspec-core (2.99.1)
40
+ rspec-expectations (2.99.2)
41
+ diff-lcs (>= 1.1.3, < 2.0)
42
+ rspec-mocks (2.99.2)
43
+ safe_yaml (1.0.3)
44
+ thor (0.19.1)
45
+
46
+ PLATFORMS
47
+ ruby
48
+
49
+ DEPENDENCIES
50
+ aruba (~> 0)
51
+ cucumber (~> 1.3)
52
+ rakali!
53
+ rake (~> 0)
54
+ rspec (~> 2.6)
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 Rakali
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,6 @@
1
+ rakali.rb
2
+ =========
3
+
4
+ [![Build Status](https://travis-ci.org/rakali/rakali.rb.svg)](https://travis-ci.org/rakali/rakali.rb)
5
+ [![Gem Version](https://badge.fury.io/rb/rakali.svg)](http://badge.fury.io/rb/rakali)
6
+ [![Code Climate](https://codeclimate.com/github/rakali/rakali.rb.png)](https://codeclimate.com/github/rakali/rakali.rb)
@@ -0,0 +1,10 @@
1
+ require 'bundler'
2
+ require 'rake'
3
+ require 'yaml'
4
+ require 'rspec/core/rake_task'
5
+
6
+ Bundler::GemHelper.install_tasks
7
+ RSpec::Core::RakeTask.new('spec')
8
+
9
+ # default task is running rspec tests
10
+ task :default => :spec
@@ -0,0 +1,54 @@
1
+ ---
2
+ layout: post
3
+ title: "The Grammar of Scholarly Communication"
4
+ tags: [markdown, authoring]
5
+ ---
6
+
7
+ Authoring of scholarly articles has been a recurring theme in this blog since it started in 2008. Authoring is still in desperate need of improvement, and nobody has convincingly figured out how to solve this problem.<!--more--> Authoring involves several steps, and it helps to think about them separately:
8
+
9
+ * **Writing**. Manuscript writing, including formatting, collaborative authoring
10
+ * **Submission**. Formatting a manuscript according to a publisher's author guidelines, and handing it over to a publishing platform
11
+ * **Revision**. Changes made to a manuscript in the peer review process, or after publication
12
+
13
+ Although authoring typically involves text, similar issues arise for other research outputs, e.g. research data. And these considerations are also relevant for other forms of publishing, whether it is self-publication on a blog or website, or publishing of preprints and white papers.
14
+
15
+ ![Flickr photo by [citnaj](http://www.flickr.com/photos/citnaj/1278021067/).](/images/grammar.jpg)
16
+
17
+ For me, the main challenge in authoring is to go from human-readable unstructured content to highly structured machine-readable content. We could make authoring simpler either by forgoing any structure and just publishing in any format we want, or by forcing authors to structure their manuscripts according to a very specific set of rules. The former doesn't seem to be an option: not only do we have a set of community standards that have evolved over a very long time (research articles, for example, have title, authors, results, references, etc.), but forgoing structure would also make it hard to find and reuse scholarly research by others.
18
+
19
+ The latter option is also not really viable since most researchers haven't learned to produce their research outputs in machine-readable, highly standardized formats. There are some exceptions, e.g. [CONSORT](http://www.consort-statement.org/) and other reporting standards in clinical medicine or the [semantic publishing in Crystallography](http://blogs.ch.cam.ac.uk/pmr/2012/01/23/brian-mcmahon-publishing-semantic-crystallography-every-science-data-publisher-should-watch-this-all-the-way-through/), but for the most part research outputs are too diverse to easily find a format that works for all of them. The current trend is certainly towards machine-readable rather than human-readable, but there is still a significant gap - scholarly articles are transformed from documents in Microsoft Word (or sometimes LaTeX) format into XML (for most biomedical research that means [JATS](http://jats.nlm.nih.gov/publishing/)) using kludgy tools and lots of manual labor.
20
+
21
+ What solutions have been tried to overcome the limitations of our current authoring tools, and to make the process more enjoyable for authors and more productive for publishers?
22
+
23
+ 1. Do the conversion manually, still a common workflow.
24
+ 2. Tools for publishers such as [eXtyles](http://blogs.plos.org/mfenner/2009/05/01/extyles_interview_with_elizabeth_blake_and_bruce_rosenblum/), [Merops](http://www.shabash.net/merops/) - both commercial - or the evolving Open Source [mPach](http://www.lib.umich.edu/mpach/modules) that convert Microsoft Word documents into JATS XML and do a lot of automated checks along the way.
25
+ 3. Tools for authors that directly generate JATS XML, either as a Microsoft Word plugin (the [Article Authoring Add-In](http://blogs.nature.com/mfenner/2008/11/07/interview-with-pablo-fernicola), not actively maintained), in the browser (e.g. [Lemon8-XML](http://blogs.plos.org/mfenner/2009/02/27/lemon8_xml_interview_with_mj_suhonos/), not actively maintained), or directly in a publishing platform such as Wordpress ([Annotum](http://annotum.org/)).
26
+ 4. Forget about XML and use HTML5 as the canonical file format, e.g. as [Scholarly HTML](http://blogs.plos.org/mfenner/2011/03/19/a-very-brief-history-of-scholarly-html/) or HTML5 specifications such as [HTMLBook](https://github.com/oreillymedia/HTMLBook/blob/master/specification.asciidoc). Please read Molly Sharp's [blog post](http://blogs.plos.org/tech/structured-documents-for-science-jats-xml-as-canonical-content-format/) for background information about HTML as an alternative to XML.
27
+ 5. Use file formats for authoring that are a better fit for the requirements of scholarly authors, in particular [Scholarly Markdown](http://blog.martinfenner.org/2012/12/13/a-call-for-scholarly-markdown/).
28
+ 6. Build online editors for scientific content that hide the underlying file format, and guide users towards a structured format, e.g. by not allowing input that doesn't conform to specifications.
29
+
30
+ **Solution 1.** isn't really an option, as it makes scholarly publishing unnecessarily slow and expensive. Typesetter Kaveh Bazergan went on record at the [SpotOn London Conference 2012](http://www.nature.com/spoton/2012/11/spoton-london-2012-a-global-conference/), saying that the current process is insane and that he wants to be "put out of business".
31
+
32
+ **Solution 2.** is probably the workflow most commonly used by larger publishers today, but it is very much centered around a Microsoft Word to XML workflow. LaTeX is a popular authoring environment in some disciplines, but still requires work to convert documents into web-friendly formats such as HTML and XML.
33
+
34
+ **Solutions 3. to 5.** have never gained any significant traction. Overall, the progress in this area has been modest at best, and the mainstream of authoring today isn't too different from 20 years ago. Although I have gone on record as saying that [Scholarly Markdown](/tags.html#markdown-ref) has a lot of potential, the problem is much bigger than finding a single file format, and markdown will never be the solution for all authoring needs.
35
+
36
+ **Solution 6.** is an area where a lot of exciting development is currently happening; examples include [Authorea](https://www.authorea.com/), [WriteLaTeX](https://www.writelatex.com/) and [ShareLaTeX](https://www.sharelatex.com/). Although the future of scholarly authoring will certainly include online authoring tools (making it much easier to collaborate, one of the authoring pain points), we run the risk of locking users into one particular authoring environment.
37
+
38
+ ### Going Forward
39
+
40
+ How can we move forward? I would suggest the following:
41
+
42
+ 1. Publishers should accept manuscripts in any reasonable file format, which means at least Microsoft Word, Open Office, LaTeX, Markdown, HTML and PDF, but possibly more. This will create a lot of extra work for publishers, but will open the doors for innovation, both in the academic and commercial sector. We will never see significant progress in scholarly authoring tools if the submission step requires manuscripts to be in a single file format (Microsoft Word) - in particular since this file format is a general purpose word processing format and not something designed specifically for scholarly content. And we want researchers to spend their time doing research and writing up their research, not formatting documents.
43
+ 2. To handle this avalanche of unstructured documents, publishers need conversion tools that can transform all these documents into a format that can feed into their editorial and publishing workflows. A limited number of these tools exist already, but this will require a significant development effort. Again, opening up submissions to a variety of file formats will not only foster innovation in authoring tools, but also in document conversion tools.
44
+ 3. We should think beyond XML. Many of the workflows designed today center around conversions from one XML format to another, e.g. Microsoft Word to JATS or [TEI](http://www.tei-c.org/index.xml) (popular in the humanities), often using XSLT transforms. Not only is XML difficult for humans to read or edit, but the web and many of the technologies built around it are moving away from XML towards HTML5 and JSON. XML is fine as an important output format for publishing, but maybe not the best format to hold everything together.
45
+ 4. As we haven't come up with a canonical file format for scholarly documents by now, we should give up that idea. XML is great for publisher workflows, but is not something humans can easily edit or read. PDF is still the most widely read format by humans, but is not a good intermediary format. LaTeX is too complex for authors outside of mathematics, physics and related fields, and is not built with web standards in mind. Markdown is promising, but doesn't easily support highly structured content. And HTML5 and the related ePub are widely popular, but can be hard to edit without a visual editor, and currently don't include enough standard metadata to support scholarly content out of the box.
46
+ 5. The focus should not be on canonical file formats for scholarly documents, but on tools that understand the manuscripts created by researchers and can transform them into something more structured. As we have learned from document conversion tools such as [Pandoc](http://johnmacfarlane.net/pandoc/), we can't do this with a simple find and replace using regular expressions, but need a more structured approach. Pandoc takes the input document (markdown, LaTeX or HTML) apart and constructs an abstract syntax tree ([AST](http://en.wikipedia.org/wiki/Abstract_syntax_tree)) of the document, using a parsing expression grammar ([PEG](http://en.wikipedia.org/wiki/Parsing_expression_grammar)), which includes a set of parsing rules. Parsing expression grammars are fairly new, [first described by Bryan Ford](http://bford.info/pub/lang/peg) about 10 years ago, but in my mind they are a very good fit for the formal grammar of scientific documents. It should be fairly straightforward to generate a variety of output formats from the AST (Pandoc can convert into more than 30 document formats); the hard part is parsing the input (see the sketch after this list).
47
+
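+ To make that last point concrete, here is a minimal Ruby sketch (Ruby rather than Haskell, and not part of Pandoc itself) that asks a locally installed Pandoc for the JSON version of its AST and tallies the block types it finds. It assumes the Pandoc 1.x JSON layout - a metadata object (`unMeta`) followed by a list of blocks - and a hypothetical input file `example.md`.
+ 
+ ```ruby
+ require 'json'
+ require 'open3'
+ 
+ # Ask Pandoc to parse the markdown source and emit its native AST as JSON.
+ ast_json, status = Open3.capture2('pandoc', '-f', 'markdown', '-t', 'json', 'example.md')
+ abort 'pandoc failed' unless status.success?
+ 
+ # Pandoc 1.x JSON: [ { "unMeta" => {...} }, [ block, block, ... ] ]
+ _meta, blocks = JSON.parse(ast_json)
+ 
+ # Each block is a hash with a type tag "t" and contents "c"; count the types.
+ blocks.group_by { |block| block['t'] }.each do |type, list|
+   puts "#{type}: #{list.size}"
+ end
+ ```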
48
+ All this requires a lot of work. Pandoc is a good model to start with, but is written in Haskell, a functional programming language that not many people are familiar with. For small changes Pandoc allows you to directly manipulate the AST (represented as JSON) using [filters](http://johnmacfarlane.net/pandoc/scripting.html) written in Haskell or Python. And [custom writers](https://github.com/jgm/pandoc) for other document formats can be written using [Lua](http://www.lua.org/), another interesting programming language that not many people know about. Lua is a fast and relatively easy to learn scripting language that can be easily embedded into other languages, and for similar reasons is also used to [extend the functionality of Wikipedia](http://en.wikipedia.org/wiki/Wikipedia:Lua). PEG parsers in other languages include [Treetop](http://treetop.rubyforge.org/) (Ruby), [PEG.js](http://pegjs.majda.cz/) (Javascript), and [ANTLR](http://www.antlr.org/), a popular parser generator that also includes PEG features.
49
+
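+ As an illustration of how small such a filter can be, here is a Ruby sketch in the same spirit as the caps example from the pandocfilters project: it walks the JSON AST and uppercases every plain-text ("Str") node, leaving code, link URLs and everything else untouched. The traversal helper is my own and only meant to show the idea, not a drop-in replacement for Pandoc's filter libraries.
+ 
+ ```ruby
+ require 'json'
+ 
+ # Recursively walk the AST and uppercase the contents of every "Str" node.
+ def caps(node)
+   case node
+   when Array
+     node.map { |child| caps(child) }
+   when Hash
+     if node['t'] == 'Str'
+       { 't' => 'Str', 'c' => node['c'].upcase }
+     else
+       node.each_with_object({}) { |(key, value), out| out[key] = caps(value) }
+     end
+   else
+     node
+   end
+ end
+ 
+ # Read the AST from stdin (e.g. piped in from `pandoc -t json`) and write it back out.
+ puts JSON.generate(caps(JSON.parse(STDIN.read)))
+ ```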
50
+ But I think the effort to build a solid open source conversion tool for scholarly documents is worth it, in particular for smaller publishers and publishing platforms who can't afford the commercial Microsoft Word to JATS conversion tools. We shouldn't take any shortcuts - e.g. by focussing on XML and XSLT transforms - and we can improve this tool over time, e.g. by starting with a few input and output formats. This tool will be valuable beyond authoring, as it can also be very helpful to convert published scholarly content into other formats such as ePub, and in text mining, which in many ways tries to solve the same problems. The [Pandoc documentation](http://johnmacfarlane.net/pandoc/scripting.html) includes an example of extracting all URLs out of a document, and this can be modified to extract other content. In case you wonder whether I gave up on the idea of [Scholarly Markdown](/tags.html#markdown-ref) - not at all. To me this is a logical next step, opening up journal submission systems to Scholarly Markdown and other evolving file formats. And Pandoc, one of the most interesting tools in this space, is a markdown conversion tool at its heart. The next steps could be the following:
51
+
52
+ * write a custom writer in Lua that generates JATS output from Pandoc
53
+ * explore how difficult it would be to add Microsoft Word .docx as Pandoc input format
54
+ * develop Pandoc filters relevant for scholarly documents (e.g. [auto-linking accession numbers of biomedical databases](/2013/07/02/auto-generating-links-to-data-and-resources/))
@@ -0,0 +1,24 @@
1
+ ---
2
+ layout: post
3
+ title: "From Markdown to JATS XML in one Step"
4
+ tags: [markdown, jats, pandoc]
5
+ ---
6
+
7
+ The Journal Article Tag Suite ([JATS](http://jats.nlm.nih.gov/)) is a NISO standard that defines a set of XML elements and attributes for tagging journal articles. JATS is not only used for fulltext content at PubMed Central (and JATS has evolved from the NLM Archiving and Interchange Tag Suite originally developed for PubMed Central), but is also increasingly used by publishers.<!--more-->
8
+
9
+ For many publishers the *version of record* of an article is stored in XML, and other formats (currently HTML, PDF and increasingly ePub) are generated from this XML. Unfortunately the process of converting author-submitted manuscripts into JATS-compliant XML is time-consuming and costly, and this is a problem in particular for small publishers.
10
+
11
+ In a recent blog post ([The Grammar of Scholarly Communication](/2013/11/17/the-grammar-of-scholarly-communication/)) I argued that publishers should accept manuscripts in any reasonable file format, including Microsoft Word, Open Office, LaTeX, Markdown, HTML and PDF. Readers of this blog know that I am a big fan of [markdown](/tags.html#markdown-ref) for scholarly documents, but I am of course well aware that at the end of the day these documents have to be converted into JATS.
12
+
13
+ As a small step towards that goal I have today released the first public version of [pandoc-jats](https://github.com/mfenner/pandoc-jats), a [custom writer for Pandoc](http://johnmacfarlane.net/pandoc/README.html#custom-writers) that converts markdown documents into JATS XML with a single command, e.g.
14
+
15
+ pandoc example.md --filter pandoc-citeproc --bibliography=example.bib --csl=apa.csl -t JATS.lua -o example.xml
16
+
17
+ Please see the [pandoc-jats](https://github.com/mfenner/pandoc-jats) Github repository for more detailed information, but using this custom writer is as simple as downloading a single `JATS.lua` file. The big challenge is of course to make this custom writer work with as many documents as possible, and that will be my job for the next few weeks. Two example JATS documents are below (both are markdown versions of scholarly articles posted on this blog as HTML):
18
+
19
+ * Nine simple ways to make it easier to (re)use your data ([HTML](/2013/06/25/nine-simple-ways-to-make-it-easier-to-reuse-your-data/), [JATS](/files/10.7287.peerj.preprints.7v2.xml))
20
+ * What Can Article Level Metrics Do for You? ([HTML](/2013/12/11/what-can-article-level-metrics-do-for-you/), [JATS](/files/10.1371.journal.pbio.1001687.xml))
21
+
22
+ Both JATS files were validated against the JATS DTD and XSD and showed no errors with the NLM XML StyleChecker - using the excellent [jats-conversion](https://github.com/PeerJ/jats-conversion) conversion and validation tools written by Alf Eaton. Markdown is actually a nice file format to convert to XML - in contrast to HTML, authors can't, for example, put closing tags in the wrong places. And a Pandoc custom writer written in the Lua scripting language is an interesting alternative to XSLT transformations, the more common way to create JATS XML. The custom writer has not been tested with other Pandoc input formats besides markdown; of particular interest are of course HTML and LaTeX - Microsoft Word .docx is unfortunately only a Pandoc output format.
23
+
24
+ This is the first public release and there is of course a lot of room for improvement. Many elements and attributes are not yet supported - although [ORCID author identifiers](http://orcid.org/blog/2013/03/22/orcid-how-more-specifying-orcid-ids-document-metadata) are of course included. Please help me improve this tool using the Github [Issue Tracker](https://github.com/mfenner/pandoc-jats/issues).
@@ -0,0 +1,55 @@
1
+ ---
2
+ layout: post
3
+ title: Don't Reinvent the Wheel
4
+ tags: [citeproc, crossref]
5
+ ---
6
+ In a [post last week](/2014/07/18/roads-not-stagecoaches/) I talked about roads and stagecoaches, and how work on scholarly infrastructure can often be more important than building customer-facing apps. One important aspect of that infrastructure work is to not duplicate efforts.<!--more-->
7
+
8
+ ![Image by Cocoabiscuit [on Flickr](http://www.flickr.com/photos/jfgallery/5673321593/)](/images/5673321593_e6a7faa36d_z.jpg)
9
+
10
+ A good example is information (or metadata) about scholarly publications. I am the technical lead for the open source [article-level metrics (ALM) software](http://articlemetrics.github.io/). This software can be used in different ways, but most people use it for tracking the metrics of scholarly articles, typically articles with DOIs issued by CrossRef. The ALM software needs three pieces of information for every article: **DOI**, **publication date**, and **title**. This information can be entered via a web interface, but that is of course not very practical for adding dozens or hundreds of articles at a time. The ALM software has therefore long supported the import of multiple articles via a text file and the command line.
11
+
12
+ This approach has been working fine for the ALM software [running at PLOS since 2009](http://articlemetrics.github.io/plos/), but it becomes a problem if, for example, the ALM software runs as a service for multiple publishers. A more flexible approach is to provide an API to upload articles, and in January 2014 I [added an API](http://articlemetrics.github.io/docs/api/) for creating, updating and deleting articles.
13
+
14
+ While the API is an improvement, it still requires integration into a number of possibly very different publisher workflows, and you have to deal with setting up the permissions, e.g. so that publisher A can't delete an article from publisher B.
15
+
16
+ The next ALM release (3.3) will therefore add a third approach to importing articles: using the [CrossRef API](http://api.crossref.org) to look up article information. Article-level metrics is about tracking already published works, so we really only care about articles that have DOIs registered with CrossRef and are therefore published. ALM is now talking to a single API, and this makes it much easier to do the same for a number of publishers without writing custom code. Since ALM is an open source application already used by several publishers, that aspect is important. And because we are importing, we don't have to worry about permissions. The only requirement is that CrossRef has the correct article information, and has this information as soon as possible after publication.
17
+
18
+ At this point I have a confession to make: I regularly use other CrossRef APIs, but wasn't aware of **api.crossref.org** until fairly recently. That is sort of understandable since the reference platform was deployed only in September last year. The documentation to get you started is on [Github](https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md) and the version history shows frequent API updates (now at v22). The API will return all kinds of information, e.g.
19
+
20
+ * how many articles publisher X published in 2012
21
+ * the percentage of publisher Y's DOIs that include at least one ORCID identifier
22
+ * a list of all books with a Creative Commons CC-BY license that were published this year
23
+
24
+ Funder (via FundRef) information is also included, but is still incomplete. Another interesting result is the number of [component DOIs](http://blogs.plos.org/mfenner/2011/03/26/direct-links-to-figures-and-tables-using-component-dois/) (DOIs for figures, tables or other parts of a document) per year:
25
+
26
+ <iframe src="http://cf.datawrapper.de/Ze7et/1/" frameborder="0" allowtransparency="true" allowfullscreen="allowfullscreen" webkitallowfullscreen="webkitallowfullscreen" mozallowfullscreen="mozallowfullscreen" oallowfullscreen="oallowfullscreen" msallowfullscreen="msallowfullscreen" width="640" height="480"></iframe>
27
+
28
+ For my specific use case I wanted an API call that returns all articles published by PLOS (or any other publisher) in the last day, which I can then run regularly. To get all DOIs from a specific publisher, use their CrossRef member ID - DOI prefixes don't work, as publishers can own more than one DOI prefix. To make this task a little easier I built a CrossRef member search interface into the ALM application:
29
+
30
+ ![](/images/crossref_api.png)
31
+
32
+ We can filter API responses by publication date, but it is a better idea to use the update date, as it is possible that the metadata have changed, e.g. a correction of the title. We also want to increase the number of results per page (using the `rows` parameter). The final API call for all DOIs updated by PLOS since the beginning of the week would be
33
+
34
+ ```
35
+ http://api.crossref.org/members/340/works?filter=from-update-date:2014-07-21,until-update-date:2014-07-24&rows=1000
36
+ ```
37
+
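+ For readers who want to try this from Ruby (the language ALM is written in), here is a minimal sketch - not taken from the ALM code base - that fetches that works list and pulls out the three pieces of information ALM needs: DOI, publication date and title. The filter dates are just the ones from the example above, and only the Ruby standard library is used.
+ 
+ ```ruby
+ require 'json'
+ require 'net/http'
+ require 'uri'
+ 
+ # All PLOS (CrossRef member 340) DOIs updated between the two dates.
+ uri = URI('http://api.crossref.org/members/340/works?filter=from-update-date:2014-07-21,until-update-date:2014-07-24&rows=1000')
+ response = JSON.parse(Net::HTTP.get(uri))
+ 
+ response.fetch('message').fetch('items').each do |item|
+   doi    = item['DOI']
+   title  = Array(item['title']).first
+   issued = (item['issued'] || {})['date-parts'].to_a.first   # e.g. [2014, 7, 22]
+   puts [doi, issued.to_a.join('-'), title].join(' | ')
+ end
+ ```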
38
+ The next step is of course to parse the JSON of the API response, and you will notice that CrossRef is using [Citeproc JSON](http://gsl-nagoya-u.net/http/pub/citeproc-doc.html). This is a standard JSON format for bibliographic information used internally by several reference managers for citation styles, but increasingly also by APIs and other places where you encounter bibliographic information.
39
+
40
+ Citeproc JSON is helpful for one particular problem with CrossRef metadata: the exact publication date for an article is not always known, and CrossRef (and similarly DataCite) only requires the publication year. Citeproc JSON can nicely handle partial dates, e.g. year-month:
41
+
42
+ ```
43
+ issued: {
44
+ date-parts: [
45
+ [
46
+ 2014,
47
+ 7
48
+ ]
49
+ ]
50
+ },
51
+ ```
52
+
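+ A partial date like this is easy to normalize once you settle on a convention. The sketch below (my own convention, not something CrossRef or the ALM software prescribes) fills missing month and day values with 1, so a year-only or year-month value still becomes a usable Ruby Date:
+ 
+ ```ruby
+ require 'date'
+ 
+ # Citeproc "issued" field as parsed from JSON; the inner array may hold
+ # one, two or three numbers (year, month, day).
+ issued = { 'date-parts' => [[2014, 7]] }
+ 
+ year, month, day = issued['date-parts'].first
+ date = Date.new(year, month || 1, day || 1)
+ 
+ puts date   # => 2014-07-01
+ ```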
53
+ I think that a similar approach will work for many other systems that require bibliographic information about scholarly content with CrossRef DOIs. If you are not already using **api.crossref.org**, consider integrating with it; I find the API fast, well documented and easy to use - and CrossRef is very responsive to feedback. As you can always wish for more, I would like to see the following: fix the problem where some journal articles are missing the publication date (a required field, even if it is only the year), and consider adding the canonical URL to the article metadata (which ALM currently has to look up itself, and which is needed to track social media coverage of an article).
54
+
55
+ *Update July 24, 2014: added chart with number of component DOIs per year*
@@ -0,0 +1,88 @@
1
+ ---
2
+ layout: post
3
+ title: What is a DOI?
4
+ tags: [doi, wikimania]
5
+ ---
6
+
7
+ This Sunday [Ian Mulvany](https://twitter.com/ianmulvany) and I will do a presentation on [Open Scholarship Tools](http://wikimania2014.wikimedia.org/wiki/Submissions/Open_Scholarship_Tools_-_a_whirlwind_tour.) at *Wikimania 2014* in London.<!--more--> From the abstract:
8
+
9
+ > This presentation will give a broad overview of tools and standards that are helping with Open Scholarship today.
10
+
11
+ One of the four broad topics we have picked is *digital object identifiers (DOIs)*. We want to introduce them to people who are new to them, and we want to show some tricks and cool things to people who already know them. Along the way we will also try to debunk some myths about DOIs.
12
+
13
+ ### What a DOI looks like
14
+
15
+ DOIs - or better, DOI names - start with a prefix in the format `10.x` where x is 4-5 digits. The suffix is determined by the organization registering the DOI, and there is no consistent pattern across organizations. The DOI name is typically expressed as a URL (see below). An example DOI would look like: [http://dx.doi.org/10.5555/12345678](http://dx.doi.org/10.5555/12345678). Something in the format **10/hvx** or [http://doi.org/hvx](http://doi.org/hvx) is a [shortDOI](http://shortdoi.org/), and **1721.1/26698** or [http://hdl.handle.net/1721.1/26698](http://hdl.handle.net/1721.1/26698) is a handle. BTW, all DOI names are also handles, so [http://hdl.handle.net/10/hvx](http://hdl.handle.net/10/hvx) for the shortDOI example above will resolve correctly.
16
+
17
+ ### DOIs are persistent identifiers
18
+
19
+ Links to resources can change, particularly over long periods of time. Persistent identifiers are needed so that readers can still find the content we reference in a scholarly work (or anything else where persistent linking is important) 10 or 50 years later. There are many kinds of persistent identifiers; one of the key concepts - and a major difference from URLs - is to separate the identifier for the resource from its location. Persistent identifiers require technical infrastructure to resolve identifiers (DOIs use the [Handle System](http://www.handle.net/)) and to allow long-term archiving of resources. DOI registration agencies such as DataCite or CrossRef are required to provide that persistence. Other persistent identifier schemes besides DOIs include [persistent uniform resource locators (PURLs)](http://en.wikipedia.org/wiki/PURL) and [Archival Resource Keys (ARKs)](http://en.wikipedia.org/wiki/Archival_Resource_Key).
20
+
21
+ ### DOIs have attached metadata
22
+
23
+ All DOIs have metadata attached to them. The metadata are supplied by the resource provider, e.g. the publisher, and exposed in services run by registration agencies, for example metadata search and content negotiation (see below). There is a minimal set of required metadata for every DOI, but beyond that, different registration agencies will use different metadata schemata, and most metadata are optional. Metadata are important for building centralized discovery services, making it easier to describe a resource, e.g. one journal article citing another. Some of the more recent additions to metadata schemata include persistent identifiers for people ([ORCID](http://orcid.org/)) and funding agencies ([FundRef](http://www.crossref.org/fundref/)), and license information. The following API call will retrieve all publications registered with CrossRef that use a [Creative Commons Attribution license](http://creativecommons.org/licenses/by/3.0/deed.en_US) (and where this information has been provided by the publisher):
24
+
25
+ ```
26
+ http://api.crossref.org/funders/10.13039/100000001/works?filter=license.url:http://creativecommons.org/licenses/by/3.0/deed.en_US
27
+ ```
28
+
29
+ ### DOIs support link tracking
30
+
31
+ Links to other resources are an important part of the metadata, and describing all citations between a large number of scholarly documents is a task that can only really be accomplished by a central resource. To solve this very problem, DOIs were invented and the CrossRef organization was started around 15 years ago.
32
+
33
+ ### Not every DOI is the same
34
+
35
+ The DOI system [originated from an initiative by scholarly publishers](http://www.doi.org/doi_handbook/1_Introduction.html) (first announced at the Frankfurt Book Fair in 1997), with citation linking of journal articles as its first application. This citation linking system is managed by [CrossRef](http://www.crossref.org/), a non-profit member organization of scholarly publishers, and [more than half](http://search.crossref.org/help/status) of the about [100 million DOIs](http://www.doi.org/faq.html) that have been assigned to date are managed by them.
36
+
37
+ But many DOIs are assigned by one of the other 8 [registration agencies](http://www.doi.org/RA_Coverage.html). You probably know [DataCite](http://www.datacite.org/), but did you know that the [Publications Office of the European Union (OP)](http://publications.europa.eu/index_en.htm) and the [Entertainment Identifier Registry (EIDR)](http://www.eidr.org/) also assign DOIs? The distinction is important, because some of the functionality is a service of the registration agency - metadata search for example is offered by CrossRef ([http://search.crossref.org](http://search.crossref.org)) and DataCite ([http://search.datacite.org](http://search.datacite.org)), but you can't search for a DataCite DOI in the CrossRef metadata search. There is an API to find out the registration agency behind a DOI so that you know what services to expect:
38
+
39
+ ```
40
+ http://api.crossref.org/works/10.6084/m9.figshare.821213/agency
41
+
42
+ {
43
+ "status": "ok",
44
+ "message-type": "work-agency",
45
+ "message-version": "1.0.0",
46
+ "message": {
47
+ "DOI": "10.6084/m9.figshare.821213",
48
+ "agency": {
49
+ "id": "datacite",
50
+ "label": "DataCite"
51
+ }
52
+ }
53
+ }
54
+ ```
55
+
56
+ ### DOIs are URLs
57
+
58
+ [DOI names may be expressed as URLs (URIs) through an HTTP proxy server](http://www.doi.org/faq.html) - e.g. [http://dx.doi.org/10.5555/12345679](http://dx.doi.org/10.5555/12345679) - and this is how DOIs are typically resolved. For this reason the [CrossRef DOI Display Guidelines](http://www.crossref.org/02publishers/doi_display_guidelines.htm) recommend that *CrossRef DOIs should always be displayed as permanent URLs in the online environment*. Because DOIs can be expressed as URLs, they also share their features:
59
+
60
+ #### Special characters
61
+
62
+ Because DOIs can be expressed as URLs, DOIs [should only include characters allowed in URLs](http://www.crossref.org/02publishers/15doi_guidelines.html), something that wasn't always true in the past and can cause problems, e.g. when using SICIs ([Serial Item and Contribution Identifier](https://en.wikipedia.org/wiki/Serial_Item_and_Contribution_Identifier)), an extension of the ISSN for journals:
63
+
64
+ ```
65
+ 10.4567/0361-9230(1997)42:<OaEoSR>2.0.TX;2-B
66
+ ```
67
+
68
+ #### Content negotiation
69
+
70
+ The DOI resolver at *doi.org* (or *dx.doi.org*) normally resolves to the resource location, e.g. a landing page at a publisher website. Requests that are not for content type `text/html` are redirected to the registration agency metadata service (currently for CrossRef, DataCite and mEDRA DOIs). Using [content negotiation](http://www.crosscite.org/cn/), we can ask the metadata service to send us the metadata in a format we specify (e.g. Citeproc JSON, BibTeX or even a formatted citation in one of thousands of citation styles) instead of getting redirected to the resource. This is a great way to collect bibliographic information, e.g. to format citations for a manuscript. In theory we could also use content negotiation to get a particular representation of a resource, e.g. `application/pdf` for a PDF of a paper or `text/csv` for a dataset in CSV format. This is not widely supported and I don't know the details of the implementation in the DOI resolver, but you can try this (content negotiation is easier with the command line than with a browser):
71
+
72
+ ```
73
+ curl -LH "Accept: application/pdf" http://dx.doi.org/10.7717/peerj.500 >peerj.500.pdf
74
+ ```
75
+
76
+ This will save the PDF of the 500th PeerJ paper published last week.
77
+
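+ The same idea works for metadata formats. Here is a small Ruby sketch (standard library only, with the PeerJ DOI from above as an arbitrary example) that asks the DOI resolver for Citeproc JSON by setting the Accept header and following redirects to the metadata service:
+ 
+ ```ruby
+ require 'json'
+ require 'net/http'
+ require 'uri'
+ 
+ # Resolve a DOI via content negotiation and return its Citeproc JSON metadata.
+ def fetch_citeproc(doi, limit = 5)
+   uri = URI("http://dx.doi.org/#{doi}")
+   limit.times do
+     request = Net::HTTP::Get.new(uri)
+     request['Accept'] = 'application/vnd.citationstyles.csl+json'
+     response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') { |http| http.request(request) }
+     return JSON.parse(response.body) unless response.is_a?(Net::HTTPRedirection)
+     uri = URI(response['location'])   # follow the redirect to the metadata service
+   end
+   raise 'too many redirects'
+ end
+ 
+ metadata = fetch_citeproc('10.7717/peerj.500')
+ puts metadata['title']
+ ```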
78
+ #### Fragment identifiers
79
+
80
+ As discussed in [my last blog post](http://blog.martinfenner.org/2014/08/02/fragment-identifiers-and-dois/), we can use fragment identifiers to link to subsections of a document with DOIs, e.g. [http://dx.doi.org/10.1371/journal.pone.0103437#s2](http://dx.doi.org/10.1371/journal.pone.0103437#s2) or [http://doi.org/10.5446/12780#t=00:20,00:27](http://doi.org/10.5446/12780#t=00:20,00:27), just as we can with every other URL. This is a nice way to directly link to a specific document section, e.g. when discussing a paper on Twitter. Fragment identifiers are implemented by the client (typically a web browser) and depend on the document type, but for DOIs that resolve to fulltext HTML documents they can add granularity to the DOI without much effort.
81
+
82
+ #### Queries
83
+
84
+ URLs obviously support queries, but that is a feature I haven't yet seen with DOIs. Queries would allow interesting features, partly overlapping with what is possible with fragment identifiers and content negotiation, e.g. `http://dx.doi.org/10.7717/peerj.500?format=pdf`. I hope to find out more by Sunday.
85
+
86
+ ### Outlook
87
+
88
+ My biggest wish? Make DOIs more machine-readable. They are primarily intended for human users, enabling them to find the content associated with a DOI. But they sometimes don't work as well as they could with automated tools; one example is the [challenges automatically resolving a DOI](http://blog.martinfenner.org/2013/10/13/broken-dois/) that I described in a blog post last year. Thinking about DOIs as URLs - and using them this way - is the right direction.
Binary file
Binary file
@@ -0,0 +1,12 @@
1
+ # Pandoc filter to convert all regular text to uppercase.
2
+ # Code, link URLs, etc. are not affected.
3
+ # Adapted from Python example at https://github.com/jgm/pandocfilters/blob/master/examples/caps.py
4
+
5
+ module Rakali::Filters::Caps
6
+
7
+ def caps(key, value, format, meta)
8
+ if key == 'Str'
9
+ value.upcase
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,12 @@
1
+ # Pandoc filter to convert all regular text to uppercase.
2
+ # Code, link URLs, etc. are not affected.
3
+ # Adapted from Python example at https://github.com/jgm/pandocfilters/blob/master/examples/caps.py
4
+
5
+ module Rakali::Filters::Default
6
+
7
+ def default(key, value, format, meta)
8
+ if key == 'Str'
9
+ value.upcase
10
+ end
11
+ end
12
+ end
@@ -7,8 +7,10 @@ module Rakali
7
7
  DEFAULTS = {
8
8
  'from' => { 'format' => 'md' },
9
9
  'to' => { 'folder' => nil, 'format' => 'html' },
10
- 'schema' => 'schemata/default.json',
11
- 'strict' => false
10
+ 'schema' => 'default.json',
11
+ 'citations' => false,
12
+ 'strict' => false,
13
+ 'merge' => false
12
14
  }
13
15
 
14
16
  attr_accessor :config, :documents, :errors
@@ -26,7 +28,14 @@ module Rakali
26
28
  from_folder = @config.fetch('from').fetch('folder')
27
29
  from_format = @config.fetch('from').fetch('format')
28
30
  documents = Dir.glob("#{from_folder}/*.#{from_format}")
29
- documents.each { |document| Rakali::Document.new(document, @config) }
31
+
32
+ # merge all documents into one file if merge flag is set
33
+ # otherwise iterate through each file in source folder
34
+ if @config.fetch('merge')
35
+ Rakali::Document.new(documents, @config)
36
+ else
37
+ documents.each { |document| Rakali::Document.new(document, @config) }
38
+ end
30
39
  rescue KeyError => e
31
40
  Rakali.logger.abort_with "Fatal:", "Configuration #{e.message}."
32
41
  rescue => e
@@ -14,21 +14,32 @@ module Rakali
14
14
  @to_folder = @config.fetch('to').fetch('folder') || @from_folder
15
15
  @to_format = @config.fetch('to').fetch('format')
16
16
 
17
- # for destination filename use source name with new extension
18
- @source = File.basename(document)
19
- @destination = @source.sub(/\.#{@from_format}$/, ".#{@to_format}")
17
+ # if document is a list of files, concatenate into one input
18
+ # use to_folder name as filename
19
+ if document.is_a?(Array)
20
+ @source = document.map { |file| File.basename(file) }.join(" ")
21
+ @destination = "#{File.basename(@from_folder)}.#{@to_format}"
22
+ puts @destination
23
+ else
24
+ # otherwise use source name with new extension for destination filename
25
+ @source = File.basename(document)
26
+ @destination = @source.sub(/\.#{@from_format}$/, ".#{@to_format}")
27
+ end
28
+
29
+ # use citeproc-pandoc if citations flag is set
30
+ bibliography = @config.fetch('citations') ? "-f citeproc-pandoc" : ""
20
31
 
21
32
  # convert source document into JSON version of native AST
22
- @content = convert(nil, @from_folder, "#{@source} -t json")
33
+ @content = convert(nil, @from_folder, "#{@source} #{bibliography}-t json")
23
34
 
24
- # read in JSON schema
25
- @schema = IO.read(@config.fetch('schema'))
35
+ # read in JSON schema, use included schemata folder if no folder is given
36
+ @schema = scheme
26
37
 
27
38
  # validate JSON against schema and report errors
28
39
  @errors = validate
29
40
 
30
41
  # convert to destination document from JSON version of native AST
31
- @output = convert(@content, @to_folder, "-f json -o #{@destination}")
42
+ @output = convert(@content, @to_folder, "-f json #{bibliography}-o #{@destination}")
32
43
  Rakali.logger.abort_with "Fatal:", "Writing file #{@destination} failed" unless created?
33
44
 
34
45
  if @errors.empty?
@@ -62,6 +73,16 @@ module Rakali
62
73
  captured_stdout
63
74
  end
64
75
 
76
+ def scheme
77
+ schema = @config.fetch('schema')
78
+ if schema.include?("/")
79
+ IO.read(schema)
80
+ else
81
+ schemata_folder = File.expand_path("../../../schemata", __FILE__)
82
+ IO.read("#{schemata_folder}/#{schema}")
83
+ end
84
+ end
85
+
65
86
  def validate
66
87
  errors = JSON::Validator.fully_validate(@schema, @content)
67
88
  return [] if errors.empty?
@@ -1,3 +1,3 @@
1
1
  module Rakali
2
- VERSION = "0.0.15"
2
+ VERSION = "0.0.17"
3
3
  end
@@ -0,0 +1,28 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path("../lib/rakali/version", __FILE__)
3
+
4
+ Gem::Specification.new do |s|
5
+ s.name = 'rakali'
6
+ s.version = Rakali::VERSION
7
+ s.platform = Gem::Platform::RUBY
8
+ s.authors = ["Martin Fenner"]
9
+ s.email = 'mf@martinfenner.org'
10
+ s.homepage = 'https://github.com/rakali/rakali.rb'
11
+ s.summary = 'A Pandoc command-line wrapper'
12
+ s.description = 'A Pandoc command-line wrapper written in Ruby.'
13
+ s.license = 'MIT'
14
+
15
+ s.required_rubygems_version = ">= 1.3.6"
16
+
17
+ s.add_dependency 'thor', '~> 0.19'
18
+ s.add_dependency 'json-schema', '~> 2.2'
19
+ s.add_dependency 'safe_yaml', "~> 1.0"
20
+ s.add_dependency 'colorator', "~> 0.1"
21
+ s.add_development_dependency 'rake', '~> 0'
22
+ s.add_development_dependency "rspec", '~> 2.6'
23
+ s.add_development_dependency "cucumber", '~> 1.3'
24
+ s.add_development_dependency "aruba", '~> 0'
25
+
26
+ s.files = `git ls-files`.split($/)
27
+ s.executables = ["rakali"]
28
+ end
@@ -0,0 +1,107 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-04/schema#",
3
+ "title": "Citeproc",
4
+ "description": "Included citations using Citeproc and the Citation Style Language (CSL)",
5
+
6
+ "definitions": {
7
+ "references": {
8
+ "type": "object",
9
+ "properties": {
10
+ "c": {
11
+ "type": "array",
12
+ "items": [
13
+ {
14
+ "type": "object",
15
+ "properties": {
16
+ "c": {
17
+ "type": "object",
18
+ "properties": {
19
+ "id": {"type": "object"},
20
+ "type": {"type": "object"},
21
+ "author": {"type": "object"},
22
+ "title": {"type": "object"},
23
+ "container-title": {"type": "object"},
24
+ "issued": { "$ref": "#/definitions/date-parts" },
25
+ "page": {"type": "object"},
26
+ "volume": {"type": "object"}
27
+ },
28
+ "required": ["id","issued"]
29
+ }
30
+ }
31
+ }
32
+ ]
33
+ }
34
+ }
35
+ },
36
+ "date-parts": {
37
+ "type": "object",
38
+ "properties": {
39
+ "c": {
40
+ "type": "object",
41
+ "properties": {
42
+ "year": {"type": "object"},
43
+ "month": {"type": "object"},
44
+ "day": {"type": "object"}
45
+ },
46
+ "required": ["year"]
47
+ }
48
+ }
49
+ }
50
+ },
51
+
52
+ "type": "array",
53
+ "items": [
54
+ {
55
+ "type": "object",
56
+ "properties": {
57
+ "unMeta": {
58
+ "type": "object",
59
+ "properties": {
60
+ "bibliography": {"type": "object"},
61
+ "references": { "$ref": "#/definitions/references" },
62
+ "csl": {"type": "object"},
63
+ "citation-abbreviations": {"type": "object"}
64
+ },
65
+ "anyOf": [
66
+ {
67
+ "bibliography": {"type": "object"},
68
+ "required": ["bibliography"]
69
+ },
70
+ {
71
+ "references": { "$ref": "#/definitions/references" },
72
+ "required": ["references"]
73
+ }
74
+ ]
75
+ }
76
+ }
77
+ },
78
+ {
79
+ "type": "array",
80
+ "items": [
81
+ {
82
+ "type": "object",
83
+ "properties": {
84
+ "t": {"enum": ["Div","Header"]},
85
+ "c": {
86
+ "type": "array",
87
+ "items": [
88
+ {
89
+ "type": "number"
90
+ },
91
+ {
92
+ "type": "array",
93
+ "items": [
94
+ {
95
+ "type": "string",
96
+ "enum": ["abstract","references"]
97
+ }
98
+ ]
99
+ }
100
+ ]
101
+ }
102
+ }
103
+ }
104
+ ]
105
+ }
106
+ ]
107
+ }
@@ -0,0 +1,19 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-04/schema#",
3
+ "title": "Default",
4
+ "description": "The default Pandoc schema",
5
+ "type": "array",
6
+ "items": [
7
+ {
8
+ "type": "object",
9
+ "properties": {
10
+ "unMeta": {
11
+ "type": "object"
12
+ }
13
+ }
14
+ },
15
+ {
16
+ "type": "array"
17
+ }
18
+ ]
19
+ }
@@ -0,0 +1,72 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-04/schema#",
3
+ "title": "JATS",
4
+ "description": "Journal Article Tagging Suite (JATS)",
5
+
6
+ "definitions": {
7
+ "authors": {
8
+ "type": "object"
9
+ },
10
+ "article": {
11
+ "type": "object",
12
+ "properties": {
13
+ "c": {
14
+ "type": "object",
15
+ "properties": {
16
+ "doi": {"type": "object"}
17
+ },
18
+ "required": ["doi"]
19
+ }
20
+ }
21
+ },
22
+ "journal": {
23
+ "type": "object",
24
+ "properties": {
25
+ "c": {
26
+ "type": "object",
27
+ "properties": {
28
+ "title": {"type": "object"},
29
+ "eissn": {"type": "object"},
30
+ "publisher-id": {"type": "object"}
31
+ },
32
+ "required": ["title","eissn","publisher-id"]
33
+ }
34
+ }
35
+ },
36
+ "publisher": {
37
+ "type": "object",
38
+ "properties": {
39
+ "c": {
40
+ "type": "object",
41
+ "properties": {
42
+ "name": {"type": "object"}
43
+ },
44
+ "required": ["name"]
45
+ }
46
+ }
47
+ }
48
+ },
49
+
50
+ "type": "array",
51
+ "items": [
52
+ {
53
+ "type": "object",
54
+ "properties": {
55
+ "unMeta": {
56
+ "type": "object",
57
+ "properties": {
58
+ "title": {"type": "object"},
59
+ "authors": { "$ref": "#/definitions/authors" },
60
+ "article": { "$ref": "#/definitions/article" },
61
+ "journal": { "$ref": "#/definitions/journal" },
62
+ "publisher": { "$ref": "#/definitions/publisher" }
63
+ },
64
+ "required": ["title","authors","article","journal","publisher"]
65
+ }
66
+ }
67
+ },
68
+ {
69
+ "type": "array"
70
+ }
71
+ ]
72
+ }
@@ -0,0 +1,35 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-04/schema#",
3
+ "title": "Jekyll",
4
+ "description": "Jekyll static site generator",
5
+
6
+ "definitions": {
7
+ "tags": {
8
+ "type": "object",
9
+ "properties": {
10
+ "c": {"type": "array"}
11
+ }
12
+ }
13
+ },
14
+
15
+ "type": "array",
16
+ "items": [
17
+ {
18
+ "type": "object",
19
+ "properties": {
20
+ "unMeta": {
21
+ "type": "object",
22
+ "properties": {
23
+ "title": {"type": "object"},
24
+ "layout": {"type": "object"},
25
+ "tags": { "$ref": "#/definitions/tags" }
26
+ },
27
+ "required": ["title","layout"]
28
+ }
29
+ }
30
+ },
31
+ {
32
+ "type": "array"
33
+ }
34
+ ]
35
+ }
@@ -0,0 +1,39 @@
1
+ require 'spec_helper'
2
+
3
+ describe Rakali::Converter do
4
+ describe "config" do
5
+ it "should read the default config" do
6
+ config = Rakali::Converter::DEFAULTS
7
+ config.fetch('from').fetch('format').should eq('md')
8
+ end
9
+
10
+ it "should read the config via file" do
11
+ file = fixture_path + 'docx.yml'
12
+ subject = Rakali::Converter.new(file)
13
+ subject.config.fetch('from').fetch('folder').should eq('minimal')
14
+ subject.config.fetch('from').fetch('format').should eq('docx')
15
+ end
16
+
17
+ it "should merge default format" do
18
+ file = fixture_path + 'only_folder_key.yml'
19
+ subject = Rakali::Converter.new(file)
20
+ subject.config.fetch('from').fetch('folder').should eq('minimal')
21
+ subject.config.fetch('from').fetch('format').should eq('md')
22
+ end
23
+
24
+ it "should raise an error when the config file doesn't exist" do
25
+ file = fixture_path + 'x'
26
+ lambda { Rakali::Converter.new(file) }.should raise_error SystemExit
27
+ end
28
+
29
+ it "should raise an error when the config file is empty" do
30
+ file = fixture_path + 'empty.yml'
31
+ lambda { Rakali::Converter.new(file) }.should raise_error SystemExit
32
+ end
33
+
34
+ it "should raise an error when the \"from\" key config doesn't exist" do
35
+ file = fixture_path + 'no_from_key.yml'
36
+ lambda { Rakali::Converter.new(file) }.should raise_error SystemExit
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,63 @@
1
+ require 'spec_helper'
2
+
3
+ describe Rakali::Document do
4
+ describe "convert" do
5
+ it "should convert minimal input" do
6
+ document = fixture_path + 'minimal.md'
7
+ config = Rakali::Utils.deep_merge_hashes(Rakali::Converter::DEFAULTS,
8
+ { 'from' => { 'folder' => fixture_path }, 'to' => { 'format' => 'docx' } })
9
+ subject = Rakali::Document.new(document, config)
10
+ subject.valid?.should be_truthy
11
+ subject.errors.should be_empty
12
+ subject.created?.should be_truthy
13
+ end
14
+
15
+ it "should convert multiple files" do
16
+ documents = Dir.glob("#{fixture_path}/*.md")
17
+ config = Rakali::Utils.deep_merge_hashes(Rakali::Converter::DEFAULTS,
18
+ { 'from' => { 'folder' => fixture_path }, 'to' => { 'format' => 'epub' }, 'merge' => true })
19
+ subject = Rakali::Document.new(documents, config)
20
+ subject.valid?.should be_truthy
21
+ subject.errors.should be_empty
22
+ subject.created?.should be_truthy
23
+ end
24
+ end
25
+
26
+ describe "validate" do
27
+ it "should validate with empty input" do
28
+ document = fixture_path + 'empty.md'
29
+ config = Rakali::Utils.deep_merge_hashes(Rakali::Converter::DEFAULTS,
30
+ { 'from' => { 'folder' => fixture_path } })
31
+ subject = Rakali::Document.new(document, config)
32
+ subject.valid?.should be_truthy
33
+ subject.errors.should be_empty
34
+ end
35
+
36
+ it "should not validate with empty input and extended schema" do
37
+ document = fixture_path + 'empty.md'
38
+ config = Rakali::Utils.deep_merge_hashes(Rakali::Converter::DEFAULTS,
39
+ { 'from' => { 'folder' => fixture_path }, 'schema' => 'jekyll.json' })
40
+ subject = Rakali::Document.new(document, config)
41
+ subject.valid?.should be_falsey
42
+ subject.errors.length.should == 2
43
+ subject.errors.first.should match("The property '#/0/unMeta' did not contain a required property of 'title'")
44
+ subject.errors.last.should match("The property '#/0/unMeta' did not contain a required property of 'layout'")
45
+ end
46
+
47
+ it "should not validate with empty input and extended schema and raise error" do
48
+ document = fixture_path + 'empty.md'
49
+ config = Rakali::Utils.deep_merge_hashes(Rakali::Converter::DEFAULTS,
50
+ { 'from' => { 'folder' => fixture_path }, 'schema' => 'jekyll.json', 'strict' => true })
51
+ lambda { Rakali::Document.new(document, config) }.should raise_error SystemExit
52
+ end
53
+
54
+ it "should validate with extended input and extended schema" do
55
+ document = fixture_path + 'jekyll.md'
56
+ config = Rakali::Utils.deep_merge_hashes(Rakali::Converter::DEFAULTS,
57
+ { 'from' => { 'folder' => fixture_path }, 'schema' => 'jekyll.json' })
58
+ subject = Rakali::Document.new(document, config)
59
+ subject.valid?.should be_truthy
60
+ subject.errors.should be_empty
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,3 @@
1
+ from:
2
+ folder: minimal
3
+ format: docx
File without changes
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,4 @@
1
+ from:
2
+ folder: fixtures/minimal
3
+ format: md
4
+ schema: jekyll.json
@@ -0,0 +1,8 @@
1
+ ---
2
+ layout: post
3
+ title: "Nine simple ways to make it easier to (re)use your data"
4
+ ---
5
+
6
+ # Title
7
+
8
+ This is a **test**.
@@ -0,0 +1,7 @@
1
+ ## Merge Title
2
+
3
+ This is another document and has a list.
4
+
5
+ * one
6
+ * two
7
+ * three
@@ -0,0 +1,3 @@
1
+ # title
2
+
3
+ This is a **test**.
@@ -0,0 +1 @@
1
+ schema: default.json
@@ -0,0 +1,2 @@
1
+ from:
2
+ folder: minimal
@@ -0,0 +1,101 @@
1
+ require 'spec_helper'
2
+
3
+ describe Rakali::Logger do
4
+ let(:topic) { "Topic:" }
5
+ let(:message) { "This is the message." }
6
+ let(:output) { " #{topic} #{message}" }
7
+
8
+ describe "debug" do
9
+ subject { Rakali::Logger.new(Rakali::Logger::DEBUG) }
10
+
11
+ it "initialize" do
12
+ subject.log_level.should == 0
13
+ end
14
+
15
+ it "debug" do
16
+ capture_stdout { subject.debug topic, message }.should start_with(output)
17
+ end
18
+
19
+ it "info" do
20
+ capture_stdout { subject.info topic, message }.should start_with(output)
21
+ end
22
+
23
+ it "warn" do
24
+ capture_stderr { subject.warn topic, message }.should start_with(output.yellow)
25
+ end
26
+
27
+ it "error" do
28
+ capture_stderr { subject.error topic, message }.should start_with(output.red)
29
+ end
30
+ end
31
+
32
+ describe "info" do
33
+ it "initialize" do
34
+ subject.log_level.should == 1
35
+ end
36
+
37
+ it "debug" do
38
+ capture_stdout { subject.debug topic, message }.should eq("")
39
+ end
40
+
41
+ it "info" do
42
+ capture_stdout { subject.info topic, message }.should start_with(output)
43
+ end
44
+
45
+ it "warn" do
46
+ capture_stderr { subject.warn topic, message }.should start_with(output.yellow)
47
+ end
48
+
49
+ it "error" do
50
+ capture_stderr { subject.error topic, message }.should start_with(output.red)
51
+ end
52
+ end
53
+
54
+ describe "warn" do
55
+ subject { Rakali::Logger.new(Rakali::Logger::WARN) }
56
+
57
+ it "initialize" do
58
+ subject.log_level.should == 2
59
+ end
60
+
61
+ it "debug" do
62
+ capture_stdout { subject.debug topic, message }.should eq("")
63
+ end
64
+
65
+ it "info" do
66
+ capture_stdout { subject.info topic, message }.should eq("")
67
+ end
68
+
69
+ it "warn" do
70
+ capture_stderr { subject.warn topic, message }.should start_with(output.yellow)
71
+ end
72
+
73
+ it "error" do
74
+ capture_stderr { subject.error topic, message }.should start_with(output.red)
75
+ end
76
+ end
77
+
78
+ describe "error" do
79
+ subject { Rakali::Logger.new(Rakali::Logger::ERROR) }
80
+
81
+ it "initialize" do
82
+ subject.log_level.should == 3
83
+ end
84
+
85
+ it "debug" do
86
+ capture_stdout { subject.debug topic, message }.should eq("")
87
+ end
88
+
89
+ it "info" do
90
+ capture_stdout { subject.info topic, message }.should eq("")
91
+ end
92
+
93
+ it "warn" do
94
+ capture_stderr { subject.warn topic, message }.should eq("")
95
+ end
96
+
97
+ it "error" do
98
+ capture_stderr { subject.error topic, message }.should start_with(output.red)
99
+ end
100
+ end
101
+ end
@@ -0,0 +1,62 @@
1
+ require 'bundler/setup'
2
+ Bundler.setup
3
+
4
+ require 'rakali'
5
+
6
+ RSpec.configure do |config|
7
+ config.before do
8
+ ARGV.replace []
9
+ end
10
+
11
+ def fixture_path
12
+ File.expand_path("../fixtures", __FILE__) + '/'
13
+ end
14
+
15
+ # This code was adapted from Thor, available under MIT-LICENSE
16
+ # Copyright (c) 2008 Yehuda Katz, Eric Hodel, et al.
17
+ def capture(stream)
18
+ begin
19
+ stream = stream.to_s
20
+ eval "$#{stream} = StringIO.new"
21
+ yield
22
+ result = eval("$#{stream}").string
23
+ ensure
24
+ eval("$#{stream} = #{stream.upcase}")
25
+ end
26
+
27
+ result
28
+ end
29
+
30
+ def capture_stdout(&block)
31
+ original_stdout = $stdout
32
+ $stdout = fake = StringIO.new
33
+ begin
34
+ yield
35
+ ensure
36
+ $stdout = original_stdout
37
+ end
38
+ fake.string
39
+ end
40
+
41
+ def capture_stderr(&block)
42
+ original_stderr = $stderr
43
+ $stderr = fake = StringIO.new
44
+ begin
45
+ yield
46
+ ensure
47
+ $stderr = original_stderr
48
+ end
49
+ fake.string
50
+ end
51
+
52
+ # This code was adapted from Ruby on Rails, available under MIT-LICENSE
53
+ # Copyright (c) 2004-2013 David Heinemeier Hansson
54
+ def silence_warnings
55
+ old_verbose, $VERBOSE = $VERBOSE, nil
56
+ yield
57
+ ensure
58
+ $VERBOSE = old_verbose
59
+ end
60
+
61
+ alias silence capture
62
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rakali
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.15
4
+ version: 0.0.17
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martin Fenner
@@ -129,7 +129,23 @@ executables:
129
129
  extensions: []
130
130
  extra_rdoc_files: []
131
131
  files:
132
+ - ".gitignore"
133
+ - ".rakali.yml"
134
+ - ".travis.yml"
135
+ - Gemfile
136
+ - Gemfile.lock
137
+ - LICENSE
138
+ - README.md
139
+ - Rakefile
132
140
  - bin/rakali
141
+ - examples/2013-11-17-the-grammar-of-scholarly-communication.md
142
+ - examples/2013-12-12-from-markdown-to-jats-xml-in-one-step.md
143
+ - examples/2014-07-24-dont-reinvent-the-wheel.md
144
+ - examples/2014-08-06-what-is-doi.md
145
+ - examples/fenner_2011.docx
146
+ - examples/fenner_2013.docx
147
+ - filters/caps.rb
148
+ - filters/default.rb
133
149
  - lib/rakali.rb
134
150
  - lib/rakali/cli.rb
135
151
  - lib/rakali/converter.rb
@@ -137,6 +153,24 @@ files:
137
153
  - lib/rakali/logger.rb
138
154
  - lib/rakali/utils.rb
139
155
  - lib/rakali/version.rb
156
+ - rakali.gemspec
157
+ - schemata/citeproc.json
158
+ - schemata/default.json
159
+ - schemata/jats.json
160
+ - schemata/jekyll.json
161
+ - spec/converter_spec.rb
162
+ - spec/document_spec.rb
163
+ - spec/fixtures/docx.yml
164
+ - spec/fixtures/empty.md
165
+ - spec/fixtures/empty.yml
166
+ - spec/fixtures/incomplete.yml
167
+ - spec/fixtures/jekyll.md
168
+ - spec/fixtures/merge.md
169
+ - spec/fixtures/minimal.md
170
+ - spec/fixtures/no_from_key.yml
171
+ - spec/fixtures/only_folder_key.yml
172
+ - spec/logger_spec.rb
173
+ - spec/spec_helper.rb
140
174
  homepage: https://github.com/rakali/rakali.rb
141
175
  licenses:
142
176
  - MIT