rakali 0.0.15 → 0.0.17

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cecffbfec2462ddcd3859923599fa16680ea7331
4
- data.tar.gz: c09b0a2add3793a8f02b3ec88a7413613c9bbdf6
3
+ metadata.gz: 2eb9ce467fe5995f4ccdf923e5fadc8565fa92cc
4
+ data.tar.gz: 591d43f84a478ff6aee2640e0d20b6e8495b6109
5
5
  SHA512:
6
- metadata.gz: 1768d9530d2a2de035237548852399c4b4bc1c1ceb12d0f01ec94ab43ec08096449306199f5ecd2d794053d07c790ac31bdf5f2e45769b4dc07d2930efed0b88
7
- data.tar.gz: bee23522cb306f598561cfd96ffb5a5c14027d3d42c0483df75d29d93210498069fa76a37c2c6fa6e56f916034cd368d4ce8960ae27c361f363bde0cbcb24307
6
+ metadata.gz: 612720c0b05c2890eca185903166f34272fa0225db753aac244b970a32852661275a86500180563ff866fa83d543c7ce45029c3aaf9ed01ae72749d25917ceb8
7
+ data.tar.gz: 57724aa15bed4025549c45d274d3298e9552ada75fe62fef0b783bdde5fd1b8a9fabd0e8edd083d51bb90b25cf63427a3f54d9663f9765911fae0c91bea544f4
@@ -0,0 +1,8 @@
1
+ pkg/*
2
+ *.gem
3
+ .bundle
4
+ *.html
5
+ *.epub
6
+ *.jats
7
+ spec/fixtures/*.html
8
+ spec/fixtures/*.docx
@@ -0,0 +1,6 @@
1
+ from:
2
+ folder: examples
3
+ format: md
4
+ to:
5
+ format: epub
6
+ merge: true
@@ -0,0 +1,22 @@
1
+ language: haskell
2
+
3
+ rvm:
4
+ - 1.9.3
5
+ - 2.1.2
6
+
7
+ install:
8
+ - travis_retry cabal install pandoc pandoc-citeproc
9
+ - travis_retry bundle install
10
+
11
+ script:
12
+ - bundle exec rake
13
+ - bundle exec rakali convert .rakali.yml
14
+
15
+ deploy:
16
+ provider: rubygems
17
+ api_key:
18
+ secure: Gcr3lbeTuQW0MXpO9sh2lnYN4EY9FFLaBhF9RK99JjGUYAm7HR70yWM/EhAThWtfjVhzwcjzOJ6RrGE401zVRUsye8GTMXA5d7USx1KGGmWNRG5cYTCBWhymHXv4vfSfMp3CP2FgyfcTigXu8yHh18ONpHIhbBuZNx1DliBEPgU=
19
+ gem: rakali
20
+ on:
21
+ tags: true
22
+ repo: rakali/rakali.rb
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -0,0 +1,54 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ rakali (0.0.17)
5
+ colorator (~> 0.1)
6
+ json-schema (~> 2.2)
7
+ safe_yaml (~> 1.0)
8
+ thor (~> 0.19)
9
+
10
+ GEM
11
+ remote: https://rubygems.org/
12
+ specs:
13
+ aruba (0.6.0)
14
+ childprocess (>= 0.3.6)
15
+ cucumber (>= 1.1.1)
16
+ rspec-expectations (>= 2.7.0)
17
+ builder (3.2.2)
18
+ childprocess (0.5.3)
19
+ ffi (~> 1.0, >= 1.0.11)
20
+ colorator (0.1)
21
+ cucumber (1.3.16)
22
+ builder (>= 2.1.2)
23
+ diff-lcs (>= 1.1.3)
24
+ gherkin (~> 2.12)
25
+ multi_json (>= 1.7.5, < 2.0)
26
+ multi_test (>= 0.1.1)
27
+ diff-lcs (1.2.5)
28
+ ffi (1.9.3)
29
+ gherkin (2.12.2)
30
+ multi_json (~> 1.3)
31
+ json-schema (2.2.4)
32
+ multi_json (1.10.1)
33
+ multi_test (0.1.1)
34
+ rake (0.9.6)
35
+ rspec (2.99.0)
36
+ rspec-core (~> 2.99.0)
37
+ rspec-expectations (~> 2.99.0)
38
+ rspec-mocks (~> 2.99.0)
39
+ rspec-core (2.99.1)
40
+ rspec-expectations (2.99.2)
41
+ diff-lcs (>= 1.1.3, < 2.0)
42
+ rspec-mocks (2.99.2)
43
+ safe_yaml (1.0.3)
44
+ thor (0.19.1)
45
+
46
+ PLATFORMS
47
+ ruby
48
+
49
+ DEPENDENCIES
50
+ aruba (~> 0)
51
+ cucumber (~> 1.3)
52
+ rakali!
53
+ rake (~> 0)
54
+ rspec (~> 2.6)
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 Rakali
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,6 @@
1
+ rakali.rb
2
+ =========
3
+
4
+ [![Build Status](https://travis-ci.org/rakali/rakali.rb.svg)](https://travis-ci.org/rakali/rakali.rb)
5
+ [![Gem Version](https://badge.fury.io/rb/rakali.svg)](http://badge.fury.io/rb/rakali)
6
+ [![Code Climate](https://codeclimate.com/github/rakali/rakali.rb.png)](https://codeclimate.com/github/rakali/rakali.rb)
@@ -0,0 +1,10 @@
1
+ require 'bundler'
2
+ require 'rake'
3
+ require 'yaml'
4
+ require 'rspec/core/rake_task'
5
+
6
+ Bundler::GemHelper.install_tasks
7
+ RSpec::Core::RakeTask.new('spec')
8
+
9
+ # default task is running rspec tests
10
+ task :default => :spec
@@ -0,0 +1,54 @@
1
+ ---
2
+ layout: post
3
+ title: "The Grammar of Scholarly Communication"
4
+ tags: [markdown, authoring]
5
+ ---
6
+
7
+ Authoring of scholarly articles has been a recurring theme in this blog since it started in 2008. Authoring is still in desperate need of improvement, and nobody has convincingly figured out how to solve this problem.<!--more--> Authoring involves several steps, and it helps to think about them separately:
8
+
9
+ * **Writing**. Manuscript writing, including formatting, collaborative authoring
10
+ * **Submission**. Formatting a manuscript according to a publisher's author guidelines, and handing it over to a publishing platform
11
+ * **Revision**. Changes made to a manuscript in the peer review process, or after publication
12
+
13
+ Although authoring typically involves text, similar issues arise for other research outputs, e.g. research data. And these considerations are also relevant for other forms of publishing, whether it is self-publication on a blog or website, or publishing of preprints and white papers.
14
+
15
+ ![Flickr photo by [citnaj](http://www.flickr.com/photos/citnaj/1278021067/).](/images/grammar.jpg)
16
+
17
+ For me, the main challenge in authoring is to go from human-readable unstructured content to highly structured machine-readable content. We could make authoring simpler either by forgoing any structure and just publishing in any format we want, or by forcing authors to structure their manuscripts according to a very specific set of rules. The former doesn't seem to be an option: not only do we have a set of community standards that have evolved over a very long time (research articles, for example, have title, authors, results, references, etc.), but forgoing structure would also make it hard to find and reuse scholarly research by others.
18
+
19
+ The latter option is also not really viable since most researchers haven't learned to produce their research outputs in machine-readable, highly standardized formats. There are some exceptions, e.g. [CONSORT](http://www.consort-statement.org/) and other reporting standards in clinical medicine or the [semantic publishing in Crystallography](http://blogs.ch.cam.ac.uk/pmr/2012/01/23/brian-mcmahon-publishing-semantic-crystallography-every-science-data-publisher-should-watch-this-all-the-way-through/), but for the most part research outputs are too diverse to easily find a format that works for all of them. The current trend is certainly towards machine-readable rather than human-readable, but there is still a significant gap - scholarly articles are transformed from documents in Microsoft Word (or sometimes LaTeX) format into XML (for most biomedical research that means [JATS](http://jats.nlm.nih.gov/publishing/)) using kludgy tools and lots of manual labor.
20
+
21
+ What solutions have been tried to overcome the limitations of our current authoring tools, and to make the process more enjoyable for authors and more productive for publishers?
22
+
23
+ 1. Do the conversion manually, still a common workflow.
24
+ 2. Tools for publishers such as [eXtyles](http://blogs.plos.org/mfenner/2009/05/01/extyles_interview_with_elizabeth_blake_and_bruce_rosenblum/), [Merops](http://www.shabash.net/merops/) - both commercial - or the evolving Open Source [mPach](http://www.lib.umich.edu/mpach/modules) that convert Microsoft Word documents into JATS XML and do a lot of automated checks along the way.
25
+ 3. Tools for authors that directly generate JATS XML, either as a Microsoft Word plugin (the [Article Authoring Add-In](http://blogs.nature.com/mfenner/2008/11/07/interview-with-pablo-fernicola), not actively maintained), in the browser (e.g. [Lemon8-XML](http://blogs.plos.org/mfenner/2009/02/27/lemon8_xml_interview_with_mj_suhonos/), not actively maintained), or directly in a publishing platform such as Wordpress ([Annotum](http://annotum.org/)).
26
+ 4. Forget about XML and use HTML5 as the canonical file format, e.g. as [Scholarly HTML](http://blogs.plos.org/mfenner/2011/03/19/a-very-brief-history-of-scholarly-html/) or HTML5 specifications such as [HTMLBook](https://github.com/oreillymedia/HTMLBook/blob/master/specification.asciidoc). Please read Molly Sharp's [blog post](http://blogs.plos.org/tech/structured-documents-for-science-jats-xml-as-canonical-content-format/) for background information about HTML as an alternative to XML.
27
+ 5. Use file formats for authoring that are a better fit for the requirements of scholarly authors, in particular [Scholarly Markdown](http://blog.martinfenner.org/2012/12/13/a-call-for-scholarly-markdown/).
28
+ 6. Build online editors for scientific content that hide the underlying file format, and guide users towards a structured format, e.g. by not allowing input that doesn't conform to specifications.
29
+
30
+ **Solution 1.** isn't really an option, as it makes scholarly publishing unnecessarily slow and expensive. Typesetter Kaveh Bazergan went on record at the [SpotOn London Conference 2012](http://www.nature.com/spoton/2012/11/spoton-london-2012-a-global-conference/), saying that the current process is insane and that he wants to be "put out of business".
31
+
32
+ **Solution 2.** is probably the workflow most commonly used by larger publishers today, but it is very much centered around a Microsoft Word to XML workflow. LaTeX is a popular authoring environment in some disciplines, but still requires work to convert documents into web-friendly formats such as HTML and XML.
33
+
34
+ **Solutions 3. to 5.** have never gained any significant traction. Overall, the progress in this area has been modest at best, and the mainstream of authoring today isn't too different from 20 years ago. Although I have gone on record as saying that [Scholarly Markdown](/tags.html#markdown-ref) has a lot of potential, the problem is much bigger than finding a single file format, and markdown will never be the solution for all authoring needs.
35
+
36
+ **Solution 6.** is an area where a lot of exciting development is currently happening; examples include [Authorea](https://www.authorea.com/), [WriteLaTeX](https://www.writelatex.com/) and [ShareLaTeX](https://www.sharelatex.com/). Although the future of scholarly authoring will certainly include online authoring tools (making it much easier to collaborate, one of the authoring pain points), we run the risk of locking users into one particular authoring environment.
37
+
38
+ ### Going Forward
39
+
40
+ How can we move forward? I would suggest the following:
41
+
42
+ 1. Publishers should accept manuscripts in any reasonable file format, which means at least Microsoft Word, Open Office, LaTeX, Markdown, HTML and PDF, but possibly more. This will create a lot of extra work for publishers, but will open the doors for innovation, both in the academic and commercial sector. We will never see significant progress in scholarly authoring tools if the submission step requires manuscripts to be in a single file format (Microsoft Word) - in particular since this file format is a general purpose word processing format and not something designed specifically for scholarly content. And we want researchers to spend their time doing research and writing up their research, not formatting documents.
43
+ 2. To handle this avalanche of unstructured documents, publishers need conversion tools that can transform all these documents into a format that can feed into their editorial and publishing workflows. A limited number of these tools exist already, but this will require a significant development effort. Again, opening up submissions to a variety of file formats will not only foster innovation in authoring tools, but also in document conversion tools.
44
+ 3. We should think beyond XML. Many of the workflows designed today center around conversions from one XML format to another, e.g. Microsoft Word to JATS or [TEI](http://www.tei-c.org/index.xml) (popular in the humanities), often using XSLT transforms. Not only is XML difficult for humans to read or edit, but the web and many of the technologies built around it are moving away from XML towards HTML5 and JSON. XML is fine as an important output format for publishing, but maybe not the best format to hold everything together.
45
+ 4. As we haven't come up with a canonical file format for scholarly documents by now, we should give up that idea. XML is great for publisher workflows, but is not something humans can easily edit or read. PDF is still the most widely read format by humans, but is not a good intermediary format. LaTeX is too complex for authors outside of mathematics, physics and related fields, and is not built with web standards in mind. Markdown is promising, but doesn't easily support highly structured content. And HTML5 and the related ePub are widely popular, but can be hard to edit without a visual editor, and currently don't include enough standard metadata to support scholarly content out of the box.
46
+ 5. The focus should not be on canonical file formats for scholarly documents, but on tools that understand the manuscripts created by researchers and can transform them into something more structured. As we have learned from document conversion tools such as [Pandoc](http://johnmacfarlane.net/pandoc/), we can't do this with a simple find and replace using regular expressions, but need a more structured approach. Pandoc takes the input document (markdown, LaTeX or HTML) apart and constructs an abstract syntax tree ([AST](http://en.wikipedia.org/wiki/Abstract_syntax_tree)) of the document, using a parsing expression grammar ([PEG](http://en.wikipedia.org/wiki/Parsing_expression_grammar)), which includes a set of parsing rules. Parsing expression grammars are fairly new, [first described by Bryan Ford](http://bford.info/pub/lang/peg) about 10 years ago, but in my mind they are a very good fit for the formal grammar of scientific documents. It should be fairly straightforward to generate a variety of output formats from the AST (Pandoc can convert into more than 30 document formats); the hard part is parsing the input (see the sketch after this list).
47
+
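+ To make that last point concrete, here is a minimal Ruby sketch (Ruby rather than Haskell, and not part of Pandoc itself) that asks a locally installed Pandoc for the JSON version of its AST and tallies the block types it finds. It assumes the Pandoc 1.x JSON layout - a metadata object (`unMeta`) followed by a list of blocks - and a hypothetical input file `example.md`.
+ 
+ ```ruby
+ require 'json'
+ require 'open3'
+ 
+ # Ask Pandoc to parse the markdown source and emit its native AST as JSON.
+ ast_json, status = Open3.capture2('pandoc', '-f', 'markdown', '-t', 'json', 'example.md')
+ abort 'pandoc failed' unless status.success?
+ 
+ # Pandoc 1.x JSON: [ { "unMeta" => {...} }, [ block, block, ... ] ]
+ _meta, blocks = JSON.parse(ast_json)
+ 
+ # Each block is a hash with a type tag "t" and contents "c"; count the types.
+ blocks.group_by { |block| block['t'] }.each do |type, list|
+   puts "#{type}: #{list.size}"
+ end
+ ```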
48
+ All this requires a lot of work. Pandoc is a good model to start with, but is written in Haskell, a functional programming language that not many people are familiar with. For small changes Pandoc allows you to directly manipulate the AST (represented as JSON) using [filters](http://johnmacfarlane.net/pandoc/scripting.html) written in Haskell or Python. And [custom writers](https://github.com/jgm/pandoc) for other document formats can be written using [Lua](http://www.lua.org/), another interesting programming language that not many people know about. Lua is a fast and relatively easy to learn scripting language that can be easily embedded into other languages, and for similar reasons is also used to [extend the functionality of Wikipedia](http://en.wikipedia.org/wiki/Wikipedia:Lua). PEG parsers in other languages include [Treetop](http://treetop.rubyforge.org/) (Ruby), [PEG.js](http://pegjs.majda.cz/) (Javascript), and [ANTLR](http://www.antlr.org/), a popular parser generator that also includes PEG features.
49
+
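+ As an illustration of how small such a filter can be, here is a Ruby sketch in the same spirit as the caps example from the pandocfilters project: it walks the JSON AST and uppercases every plain-text ("Str") node, leaving code, link URLs and everything else untouched. The traversal helper is my own and only meant to show the idea, not a drop-in replacement for Pandoc's filter libraries.
+ 
+ ```ruby
+ require 'json'
+ 
+ # Recursively walk the AST and uppercase the contents of every "Str" node.
+ def caps(node)
+   case node
+   when Array
+     node.map { |child| caps(child) }
+   when Hash
+     if node['t'] == 'Str'
+       { 't' => 'Str', 'c' => node['c'].upcase }
+     else
+       node.each_with_object({}) { |(key, value), out| out[key] = caps(value) }
+     end
+   else
+     node
+   end
+ end
+ 
+ # Read the AST from stdin (e.g. piped in from `pandoc -t json`) and write it back out.
+ puts JSON.generate(caps(JSON.parse(STDIN.read)))
+ ```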
50
+ But I think the effort to build a solid open source conversion tool for scholarly documents is worth it, in particular for smaller publishers and publishing platforms who can't afford the commercial Microsoft Word to JATS conversion tools. We shouldn't take any shortcuts - e.g. by focussing on XML and XSLT transforms - and we can improve this tool over time, e.g. by starting with a few input and output formats. This tool will be valuable beyond authoring, as it can also be very helpful to convert published scholarly content into other formats such as ePub, and in text mining, which in many ways tries to solve the same problems. The [Pandoc documentation](http://johnmacfarlane.net/pandoc/scripting.html) includes an example of extracting all URLs out of a document, and this can be modified to extract other content. In case you wonder whether I gave up on the idea of [Scholarly Markdown](/tags.html#markdown-ref) - not at all. To me this is a logical next step, opening up journal submission systems to Scholarly Markdown and other evolving file formats. And Pandoc, one of the most interesting tools in this space, is a markdown conversion tool at its heart. The next steps could be the following:
51
+
52
+ * write a custom writer in Lua that generates JATS output from Pandoc
53
+ * explore how difficult it would be to add Microsoft Word .docx as Pandoc input format
54
+ * develop Pandoc filters relevant for scholarly documents (e.g. [auto-linking accession numbers of biomedical databases](/2013/07/02/auto-generating-links-to-data-and-resources/))
@@ -0,0 +1,24 @@
1
+ ---
2
+ layout: post
3
+ title: "From Markdown to JATS XML in one Step"
4
+ tags: [markdown, jats, pandoc]
5
+ ---
6
+
7
+ The Journal Article Tag Suite ([JATS](http://jats.nlm.nih.gov/)) is a NISO standard that defines a set of XML elements and attributes for tagging journal articles. JATS is not only used for fulltext content at PubMed Central (and JATS has evolved from the NLM Archiving and Interchange Tag Suite originally developed for PubMed Central), but is also increasingly used by publishers.<!--more-->
8
+
9
+ For many publishers the *version of record* of an article is stored in XML, and other formats (currently HTML, PDF and increasingly ePub) are generated from this XML. Unfortunately the process of converting author-submitted manuscripts into JATS-compliant XML is time-consuming and costly, and this is a problem in particular for small publishers.
10
+
11
+ In a recent blog post ([The Grammar of Scholarly Communication](/2013/11/17/the-grammar-of-scholarly-communication/)) I argued that publishers should accept manuscripts in any reasonable file format, including Microsoft Word, Open Office, LaTeX, Markdown, HTML and PDF. Readers of this blog know that I am a big fan of [markdown](/tags.html#markdown-ref) for scholarly documents, but I am of course well aware that at the end of the day these documents have to be converted into JATS.
12
+
13
+ As a small step towards that goal I have today released the first public version of [pandoc-jats](https://github.com/mfenner/pandoc-jats), a [custom writer for Pandoc](http://johnmacfarlane.net/pandoc/README.html#custom-writers) that converts markdown documents into JATS XML with a single command, e.g.
14
+
15
+ pandoc example.md --filter pandoc-citeproc --bibliography=example.bib --csl=apa.csl -t JATS.lua -o example.xml
16
+
17
+ Please see the [pandoc-jats](https://github.com/mfenner/pandoc-jats) Github repository for more detailed information, but using this custom writer is as simple as downloading a single `JATS.lua` file. The big challenge is of course to make this custom writer work with as many documents as possible, and that will be my job for the next few weeks. Two example JATS documents are below (both are markdown versions of scholarly articles posted on this blog as HTML):
18
+
19
+ * Nine simple ways to make it easier to (re)use your data ([HTML](/2013/06/25/nine-simple-ways-to-make-it-easier-to-reuse-your-data/), [JATS](/files/10.7287.peerj.preprints.7v2.xml))
20
+ * What Can Article Level Metrics Do for You? ([HTML](/2013/12/11/what-can-article-level-metrics-do-for-you/), [JATS](/files/10.1371.journal.pbio.1001687.xml))
21
+
22
+ Both JATS files were validated against the JATS DTD and XSD and showed no errors with the NLM XML StyleChecker - using the excellent [jats-conversion](https://github.com/PeerJ/jats-conversion) conversion and validation tools written by Alf Eaton. Markdown is actually a nice file format to convert to XML - in contrast to HTML, authors can't, for example, put closing tags in the wrong places. And a Pandoc custom writer written in the Lua scripting language is an interesting alternative to XSLT transformations, the more common way to create JATS XML. The custom writer has not been tested with other Pandoc input formats besides markdown; of particular interest are of course HTML and LaTeX - Microsoft Word .docx is unfortunately only a Pandoc output format.
23
+
24
+ This is the first public release and there is of course a lot of room for improvement. Many elements and attributes are not yet supported - although [ORCID author identifiers](http://orcid.org/blog/2013/03/22/orcid-how-more-specifying-orcid-ids-document-metadata) are of course included. Please help me improve this tool using the Github [Issue Tracker](https://github.com/mfenner/pandoc-jats/issues).
@@ -0,0 +1,55 @@
1
+ ---
2
+ layout: post
3
+ title: Don't Reinvent the Wheel
4
+ tags: [citeproc, crossref]
5
+ ---
6
+ In a [post last week](/2014/07/18/roads-not-stagecoaches/) I talked about roads and stagecoaches, and how work on scholarly infrastructure can often be more important than building customer-facing apps. One important aspect of that infrastructure work is to not duplicate efforts.<!--more-->
7
+
8
+ ![Image by Cocoabiscuit [on Flickr](http://www.flickr.com/photos/jfgallery/5673321593/)](/images/5673321593_e6a7faa36d_z.jpg)
9
+
10
+ A good example is information (or metadata) about scholarly publications. I am the technical lead for the open source [article-level metrics (ALM) software](http://articlemetrics.github.io/). This software can be used in different ways, but most people use it for tracking the metrics of scholarly articles, typically articles with DOIs issued by CrossRef. The ALM software needs three pieces of information for every article: **DOI**, **publication date**, and **title**. This information can be entered via a web interface, but that is of course not very practical for adding dozens or hundreds of articles at a time. The ALM software has therefore long supported the import of multiple articles via a text file and the command line.
11
+
12
+ This approach has been working fine for the ALM software [running at PLOS since 2009](http://articlemetrics.github.io/plos/), but it becomes a problem if, for example, the ALM software runs as a service for multiple publishers. A more flexible approach is to provide an API to upload articles, and in January 2014 I [added an API](http://articlemetrics.github.io/docs/api/) for creating, updating and deleting articles.
13
+
14
+ While the API is an improvement, it still requires integration into a number of possibly very different publisher workflows, and you have to deal with setting up the permissions, e.g. so that publisher A can't delete an article from publisher B.
15
+
16
+ The next ALM release (3.3) will therefore add a third approach to importing articles: using the [CrossRef API](http://api.crossref.org) to look up article information. Article-level metrics is about tracking already published works, so we really only care about articles that have DOIs registered with CrossRef and are therefore published. ALM is now talking to a single API, and this makes it much easier to do the same for a number of publishers without writing custom code. Since ALM is an open source application already used by several publishers, that aspect is important. And because we are importing, we don't have to worry about permissions. The only requirement is that CrossRef has the correct article information, and has this information as soon as possible after publication.
17
+
18
+ At this point I have a confession to make: I regularly use other CrossRef APIs, but wasn't aware of **api.crossref.org** until fairly recently. That is sort of understandable since the reference platform was deployed only in September last year. The documentation to get you started is on [Github](https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md) and the version history shows frequent API updates (now at v22). The API will return all kinds of information, e.g.
19
+
20
+ * how many articles publisher X published in 2012
21
+ * the percentage of publisher Y's DOIs that include at least one ORCID identifier
22
+ * a list of all books with a Creative Commons CC-BY license that were published this year
23
+
24
+ Funder (via FundRef) information is also included, but is still incomplete. Another interesting result is the number of [component DOIs](http://blogs.plos.org/mfenner/2011/03/26/direct-links-to-figures-and-tables-using-component-dois/) (DOIs for figures, tables or other parts of a document) per year:
25
+
26
+ <iframe src="http://cf.datawrapper.de/Ze7et/1/" frameborder="0" allowtransparency="true" allowfullscreen="allowfullscreen" webkitallowfullscreen="webkitallowfullscreen" mozallowfullscreen="mozallowfullscreen" oallowfullscreen="oallowfullscreen" msallowfullscreen="msallowfullscreen" width="640" height="480"></iframe>
27
+
28
+ For my specific use case I wanted an API call that returns all articles published by PLOS (or any other publisher) in the last day, which I can then run regularly. To get all DOIs from a specific publisher, use their CrossRef member ID - DOI prefixes don't work, as publishers can own more than one DOI prefix. To make this task a little easier I built a CrossRef member search interface into the ALM application:
29
+
30
+ ![](/images/crossref_api.png)
31
+
32
+ We can filter API responses by publication date, but it is a better idea to use the update date, as it is possible that the metadata have changed, e.g. a correction of the title. We also want to increase the number of results per page (using the `rows` parameter). The final API call for all DOIs updated by PLOS since the beginning of the week would be
33
+
34
+ ```
35
+ http://api.crossref.org/members/340/works?filter=from-update-date:2014-07-21,until-update-date:2014-07-24&rows=1000
36
+ ```
37
+
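+ For readers who want to try this from Ruby (the language ALM is written in), here is a minimal sketch - not taken from the ALM code base - that fetches that works list and pulls out the three pieces of information ALM needs: DOI, publication date and title. The filter dates are just the ones from the example above, and only the Ruby standard library is used.
+ 
+ ```ruby
+ require 'json'
+ require 'net/http'
+ require 'uri'
+ 
+ # All PLOS (CrossRef member 340) DOIs updated between the two dates.
+ uri = URI('http://api.crossref.org/members/340/works?filter=from-update-date:2014-07-21,until-update-date:2014-07-24&rows=1000')
+ response = JSON.parse(Net::HTTP.get(uri))
+ 
+ response.fetch('message').fetch('items').each do |item|
+   doi    = item['DOI']
+   title  = Array(item['title']).first
+   issued = (item['issued'] || {})['date-parts'].to_a.first   # e.g. [2014, 7, 22]
+   puts [doi, issued.to_a.join('-'), title].join(' | ')
+ end
+ ```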
38
+ The next step is of course to parse the JSON of the API response, and you will notice that CrossRef is using [Citeproc JSON](http://gsl-nagoya-u.net/http/pub/citeproc-doc.html). This is a standard JSON format for bibliographic information used internally by several reference managers for citation styles, but increasingly also by APIs and other places where you encounter bibliographic information.
39
+
40
+ Citeproc JSON is helpful for one particular problem with CrossRef metadata: the exact publication date for an article is not always known, and CrossRef (and similarly DataCite) only requires the publication year. Citeproc JSON can nicely handle partial dates, e.g. year-month:
41
+
42
+ ```
43
+ issued: {
44
+ date-parts: [
45
+ [
46
+ 2014,
47
+ 7
48
+ ]
49
+ ]
50
+ },
51
+ ```
52
+
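+ A partial date like this is easy to normalize once you settle on a convention. The sketch below (my own convention, not something CrossRef or the ALM software prescribes) fills missing month and day values with 1, so a year-only or year-month value still becomes a usable Ruby Date:
+ 
+ ```ruby
+ require 'date'
+ 
+ # Citeproc "issued" field as parsed from JSON; the inner array may hold
+ # one, two or three numbers (year, month, day).
+ issued = { 'date-parts' => [[2014, 7]] }
+ 
+ year, month, day = issued['date-parts'].first
+ date = Date.new(year, month || 1, day || 1)
+ 
+ puts date   # => 2014-07-01
+ ```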
53
+ I think that a similar approach will work for many other systems that require bibliographic information about scholarly content with CrossRef DOIs. If you are not already using **api.crossref.org**, consider integrating with it; I find the API fast, well documented and easy to use - and CrossRef is very responsive to feedback. As you can always wish for more, I would like to see the following: fix the problem where some journal articles are missing the publication date (a required field, even if it is only the year), and consider adding the canonical URL to the article metadata (which ALM currently has to look up itself, and which is needed to track social media coverage of an article).
54
+
55
+ *Update July 24, 2014: added chart with number of component DOIs per year*
@@ -0,0 +1,88 @@
1
+ ---
2
+ layout: post
3
+ title: What is a DOI?
4
+ tags: [doi, wikimania]
5
+ ---
6
+
7
+ This Sunday [Ian Mulvany](https://twitter.com/ianmulvany) and I will do a presentation on [Open Scholarship Tools](http://wikimania2014.wikimedia.org/wiki/Submissions/Open_Scholarship_Tools_-_a_whirlwind_tour.) at *Wikimania 2014* in London.<!--more--> From the abstract:
8
+
9
+ > This presentation will give a broad overview of tools and standards that are helping with Open Scholarship today.
10
+
11
+ One of the four broad topics we have picked is *digital object identifiers (DOIs)*. We want to introduce them to people who are new to them, and we want to show some tricks and cool things to people who already know them. Along the way we will also try to debunk some myths about DOIs.
12
+
13
+ ### What a DOI looks like
14
+
15
+ DOIs - or better, DOI names - start with a prefix in the format `10.x` where x is 4-5 digits. The suffix is determined by the organization registering the DOI, and there is no consistent pattern across organizations. The DOI name is typically expressed as a URL (see below). An example DOI would look like: [http://dx.doi.org/10.5555/12345678](http://dx.doi.org/10.5555/12345678). Something in the format **10/hvx** or [http://doi.org/hvx](http://doi.org/hvx) is a [shortDOI](http://shortdoi.org/), and **1721.1/26698** or [http://hdl.handle.net/1721.1/26698](http://hdl.handle.net/1721.1/26698) is a handle. BTW, all DOI names are also handles, so [http://hdl.handle.net/10/hvx](http://hdl.handle.net/10/hvx) for the shortDOI example above will resolve correctly.
16
+
17
+ ### DOIs are persistent identifiers
18
+
19
+ Links to resources can change, particularly over long periods of time. Persistent identifiers are needed so that readers can still find the content we reference in a scholarly work (or anything else where persistent linking is important) 10 or 50 years later. There are many kinds of persistent identifiers; one of the key concepts - and a major difference from URLs - is to separate the identifier for the resource from its location. Persistent identifiers require technical infrastructure to resolve identifiers (DOIs use the [Handle System](http://www.handle.net/)) and to allow long-term archiving of resources. DOI registration agencies such as DataCite or CrossRef are required to provide that persistence. Other persistent identifier schemes besides DOIs include [persistent uniform resource locators (PURLs)](http://en.wikipedia.org/wiki/PURL) and [Archival Resource Keys (ARKs)](http://en.wikipedia.org/wiki/Archival_Resource_Key).
20
+
21
+ ### DOIs have attached metadata
22
+
23
+ All DOIs have metadata attached to them. The metadata are supplied by the resource provider, e.g. the publisher, and exposed in services run by registration agencies, for example metadata search and content negotiation (see below). There is a minimal set of required metadata for every DOI, but beyond that, different registration agencies will use different metadata schemata, and most metadata are optional. Metadata are important for building centralized discovery services, making it easier to describe a resource, e.g. one journal article citing another. Some of the more recent additions to metadata schemata include persistent identifiers for people ([ORCID](http://orcid.org/)) and funding agencies ([FundRef](http://www.crossref.org/fundref/)), and license information. The following API call will retrieve all publications registered with CrossRef that use a [Creative Commons Attribution license](http://creativecommons.org/licenses/by/3.0/deed.en_US) (and where this information has been provided by the publisher):
24
+
25
+ ```
26
+ http://api.crossref.org/funders/10.13039/100000001/works?filter=license.url:http://creativecommons.org/licenses/by/3.0/deed.en_US
27
+ ```
28
+
29
+ ### DOIs support link tracking
30
+
31
+ Links to other resources are an important part of the metadata, and describing all citations between a large number of scholarly documents is a task that can only really be accomplished by a central resource. To solve this very problem, DOIs were invented and the CrossRef organization was started around 15 years ago.
32
+
33
+ ### Not every DOI is the same
34
+
35
+ The DOI system [originated from an initiative by scholarly publishers](http://www.doi.org/doi_handbook/1_Introduction.html) (first announced at the Frankfurt Book Fair in 1997), with citation linking of journal articles as its first application. This citation linking system is managed by [CrossRef](http://www.crossref.org/), a non-profit member organization of scholarly publishers, and [more than half](http://search.crossref.org/help/status) of the about [100 million DOIs](http://www.doi.org/faq.html) that have been assigned to date are managed by them.
36
+
37
+ But many DOIs are assigned by one of the other 8 [registration agencies](http://www.doi.org/RA_Coverage.html). You probably know [DataCite](http://www.datacite.org/), but did you know that the [Publications Office of the European Union (OP)](http://publications.europa.eu/index_en.htm) and the [Entertainment Identifier Registry (EIDR)](http://www.eidr.org/) also assign DOIs? The distinction is important, because some of the functionality is a service of the registration agency - metadata search for example is offered by CrossRef ([http://search.crossref.org](http://search.crossref.org)) and DataCite ([http://search.datacite.org](http://search.datacite.org)), but you can't search for a DataCite DOI in the CrossRef metadata search. There is an API to find out the registration agency behind a DOI so that you know what services to expect:
38
+
39
+ ```
40
+ http://api.crossref.org/works/10.6084/m9.figshare.821213/agency
41
+
42
+ {
43
+ "status": "ok",
44
+ "message-type": "work-agency",
45
+ "message-version": "1.0.0",
46
+ "message": {
47
+ "DOI": "10.6084/m9.figshare.821213",
48
+ "agency": {
49
+ "id": "datacite",
50
+ "label": "DataCite"
51
+ }
52
+ }
53
+ }
54
+ ```
55
+
56
+ ### DOIs are URLs
57
+
58
+ [DOI names may be expressed as URLs (URIs) through an HTTP proxy server](http://www.doi.org/faq.html) - e.g. [http://dx.doi.org/10.5555/12345679](http://dx.doi.org/10.5555/12345679) - and this is how DOIs are typically resolved. For this reason the [CrossRef DOI Display Guidelines](http://www.crossref.org/02publishers/doi_display_guidelines.htm) recommend that *CrossRef DOIs should always be displayed as permanent URLs in the online environment*. Because DOIs can be expressed as URLs, they also share their features:
59
+
60
+ #### Special characters
61
+
62
+ Because DOIs can be expressed as URLs, DOIs [should only include characters allowed in URLs](http://www.crossref.org/02publishers/15doi_guidelines.html), something that wasn't always true in the past and can cause problems, e.g. when using SICIs ([Serial Item and Contribution Identifier](https://en.wikipedia.org/wiki/Serial_Item_and_Contribution_Identifier)), an extension of the ISSN for journals:
63
+
64
+ ```
65
+ 10.4567/0361-9230(1997)42:<OaEoSR>2.0.TX;2-B
66
+ ```
67
+
68
+ #### Content negotiation
69
+
70
+ The DOI resolver at *doi.org* (or *dx.doi.org*) normally resolves to the resource location, e.g. a landing page at a publisher website. Requests that are not for content type `text/html` are redirected to the registration agency metadata service (currently for CrossRef, DataCite and mEDRA DOIs). Using [content negotiation](http://www.crosscite.org/cn/), we can ask the metadata service to send us the metadata in a format we specify (e.g. Citeproc JSON, BibTeX or even a formatted citation in one of thousands of citation styles) instead of getting redirected to the resource. This is a great way to collect bibliographic information, e.g. to format citations for a manuscript. In theory we could also use content negotiation to get a particular representation of a resource, e.g. `application/pdf` for a PDF of a paper or `text/csv` for a dataset in CSV format. This is not widely supported and I don't know the details of the implementation in the DOI resolver, but you can try this (content negotiation is easier with the command line than with a browser):
71
+
72
+ ```
73
+ curl -LH "Accept: application/pdf" http://dx.doi.org/10.7717/peerj.500 >peerj.500.pdf
74
+ ```
75
+
76
+ This will save the PDF of the 500th PeerJ paper published last week.
77
+
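+ The same idea works for metadata formats. Here is a small Ruby sketch (standard library only, with the PeerJ DOI from above as an arbitrary example) that asks the DOI resolver for Citeproc JSON by setting the Accept header and following redirects to the metadata service:
+ 
+ ```ruby
+ require 'json'
+ require 'net/http'
+ require 'uri'
+ 
+ # Resolve a DOI via content negotiation and return its Citeproc JSON metadata.
+ def fetch_citeproc(doi, limit = 5)
+   uri = URI("http://dx.doi.org/#{doi}")
+   limit.times do
+     request = Net::HTTP::Get.new(uri)
+     request['Accept'] = 'application/vnd.citationstyles.csl+json'
+     response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') { |http| http.request(request) }
+     return JSON.parse(response.body) unless response.is_a?(Net::HTTPRedirection)
+     uri = URI(response['location'])   # follow the redirect to the metadata service
+   end
+   raise 'too many redirects'
+ end
+ 
+ metadata = fetch_citeproc('10.7717/peerj.500')
+ puts metadata['title']
+ ```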
78
+ #### Fragment identifiers
79
+
80
+ As discussed in [my last blog post](http://blog.martinfenner.org/2014/08/02/fragment-identifiers-and-dois/), we can use fragment identifiers to link to subsections of a document with DOIs, e.g. [http://dx.doi.org/10.1371/journal.pone.0103437#s2](http://dx.doi.org/10.1371/journal.pone.0103437#s2) or [http://doi.org/10.5446/12780#t=00:20,00:27](http://doi.org/10.5446/12780#t=00:20,00:27), just as we can with every other URL. This is a nice way to directly link to a specific document section, e.g. when discussing a paper on Twitter. Fragment identifiers are implemented by the client (typically a web browser) and depend on the document type, but for DOIs that resolve to fulltext HTML documents they can add granularity to the DOI without much effort.
81
+
82
+ #### Queries
83
+
84
+ URLs obviously support queries, but that is a feature I haven't yet seen with DOIs. Queries would allow interesting features, partly overlapping with what is possible with fragment identifiers and content negotiation, e.g. `http://dx.doi.org/10.7717/peerj.500?format=pdf`. I hope to find out more by Sunday.
85
+
86
+ ### Outlook
87
+
88
+ My biggest wish? Make DOIs more machine-readable. They are primarily intended for human users, enabling them to find the content associated with a DOI. But they sometimes don't work as well as they could with automated tools; one example is the [challenges automatically resolving a DOI](http://blog.martinfenner.org/2013/10/13/broken-dois/) that I described in a blog post last year. Thinking about DOIs as URLs - and using them this way - is the right direction.
Binary file
Binary file
@@ -0,0 +1,12 @@
1
+ # Pandoc filter to convert all regular text to uppercase.
2
+ # Code, link URLs, etc. are not affected.
3
+ # Adapted from Python example at https://github.com/jgm/pandocfilters/blob/master/examples/caps.py
4
+
5
+ module Rakali::Filters::Caps
6
+
7
+ def caps(key, value, format, meta)
8
+ if key == 'Str'
9
+ value.upcase
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,12 @@
1
+ # Pandoc filter to convert all regular text to uppercase.
2
+ # Code, link URLs, etc. are not affected.
3
+ # Adapted from Python example at https://github.com/jgm/pandocfilters/blob/master/examples/caps.py
4
+
5
+ module Rakali::Filters::Default
6
+
7
+ def default(key, value, format, meta)
8
+ if key == 'Str'
9
+ value.upcase
10
+ end
11
+ end
12
+ end
@@ -7,8 +7,10 @@ module Rakali
7
7
  DEFAULTS = {
8
8
  'from' => { 'format' => 'md' },
9
9
  'to' => { 'folder' => nil, 'format' => 'html' },
10
- 'schema' => 'schemata/default.json',
11
- 'strict' => false
10
+ 'schema' => 'default.json',
11
+ 'citations' => false,
12
+ 'strict' => false,
13
+ 'merge' => false
12
14
  }
13
15
 
14
16
  attr_accessor :config, :documents, :errors
@@ -26,7 +28,14 @@ module Rakali
26
28
  from_folder = @config.fetch('from').fetch('folder')
27
29
  from_format = @config.fetch('from').fetch('format')
28
30
  documents = Dir.glob("#{from_folder}/*.#{from_format}")
29
- documents.each { |document| Rakali::Document.new(document, @config) }
31
+
32
+ # merge all documents into one file if merge flag is set
33
+ # otherwise iterate through each file in source folder
34
+ if @config.fetch('merge')
35
+ Rakali::Document.new(documents, @config)
36
+ else
37
+ documents.each { |document| Rakali::Document.new(document, @config) }
38
+ end
30
39
  rescue KeyError => e
31
40
  Rakali.logger.abort_with "Fatal:", "Configuration #{e.message}."
32
41
  rescue => e
@@ -14,21 +14,32 @@ module Rakali
14
14
  @to_folder = @config.fetch('to').fetch('folder') || @from_folder
15
15
  @to_format = @config.fetch('to').fetch('format')
16
16
 
17
- # for destination filename use source name with new extension
18
- @source = File.basename(document)
19
- @destination = @source.sub(/\.#{@from_format}$/, ".#{@to_format}")
17
+ # if document is a list of files, concatenate into one input
18
+ # use to_folder name as filename
19
+ if document.is_a?(Array)
20
+ @source = document.map { |file| File.basename(file) }.join(" ")
21
+ @destination = "#{File.basename(@from_folder)}.#{@to_format}"
22
+ puts @destination
23
+ else
24
+ # otherwise use source name with new extension for destination filename
25
+ @source = File.basename(document)
26
+ @destination = @source.sub(/\.#{@from_format}$/, ".#{@to_format}")
27
+ end
28
+
29
+ # use citeproc-pandoc if citations flag is set
30
+ bibliography = @config.fetch('citations') ? "-f citeproc-pandoc" : ""
20
31
 
21
32
  # convert source document into JSON version of native AST
22
- @content = convert(nil, @from_folder, "#{@source} -t json")
33
+ @content = convert(nil, @from_folder, "#{@source} #{bibliography}-t json")
23
34
 
24
- # read in JSON schema
25
- @schema = IO.read(@config.fetch('schema'))
35
+ # read in JSON schema, use included schemata folder if no folder is given
36
+ @schema = scheme
26
37
 
27
38
  # validate JSON against schema and report errors
28
39
  @errors = validate
29
40
 
30
41
  # convert to destination document from JSON version of native AST
31
- @output = convert(@content, @to_folder, "-f json -o #{@destination}")
42
+ @output = convert(@content, @to_folder, "-f json #{bibliography}-o #{@destination}")
32
43
  Rakali.logger.abort_with "Fatal:", "Writing file #{@destination} failed" unless created?
33
44
 
34
45
  if @errors.empty?
@@ -62,6 +73,16 @@ module Rakali
62
73
  captured_stdout
63
74
  end
64
75
 
76
+ def scheme
77
+ schema = @config.fetch('schema')
78
+ if schema.include?("/")
79
+ IO.read(schema)
80
+ else
81
+ schemata_folder = File.expand_path("../../../schemata", __FILE__)
82
+ IO.read("#{schemata_folder}/#{schema}")
83
+ end
84
+ end
85
+
65
86
  def validate
66
87
  errors = JSON::Validator.fully_validate(@schema, @content)
67
88
  return [] if errors.empty?
@@ -1,3 +1,3 @@
1
1
  module Rakali
2
- VERSION = "0.0.15"
2
+ VERSION = "0.0.17"
3
3
  end
@@ -0,0 +1,28 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path("../lib/rakali/version", __FILE__)
3
+
4
+ Gem::Specification.new do |s|
5
+ s.name = 'rakali'
6
+ s.version = Rakali::VERSION
7
+ s.platform = Gem::Platform::RUBY
8
+ s.authors = ["Martin Fenner"]
9
+ s.email = 'mf@martinfenner.org'
10
+ s.homepage = 'https://github.com/rakali/rakali.rb'
11
+ s.summary = 'A Pandoc command-line wrapper'
12
+ s.description = 'A Pandoc command-line wrapper written in Ruby.'
13
+ s.license = 'MIT'
14
+
15
+ s.required_rubygems_version = ">= 1.3.6"
16
+
17
+ s.add_dependency 'thor', '~> 0.19'
18
+ s.add_dependency 'json-schema', '~> 2.2'
19
+ s.add_dependency 'safe_yaml', "~> 1.0"
20
+ s.add_dependency 'colorator', "~> 0.1"
21
+ s.add_development_dependency 'rake', '~> 0'
22
+ s.add_development_dependency "rspec", '~> 2.6'
23
+ s.add_development_dependency "cucumber", '~> 1.3'
24
+ s.add_development_dependency "aruba", '~> 0'
25
+
26
+ s.files = `git ls-files`.split($/)
27
+ s.executables = ["rakali"]
28
+ end
@@ -0,0 +1,107 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-04/schema#",
3
+ "title": "Citeproc",
4
+ "description": "Included citations using Citeproc and the Citation Style Language (CSL)",
5
+
6
+ "definitions": {
7
+ "references": {
8
+ "type": "object",
9
+ "properties": {
10
+ "c": {
11
+ "type": "array",
12
+ "items": [
13
+ {
14
+ "type": "object",
15
+ "properties": {
16
+ "c": {
17
+ "type": "object",
18
+ "properties": {
19
+ "id": {"type": "object"},
20
+ "type": {"type": "object"},
21
+ "author": {"type": "object"},
22
+ "title": {"type": "object"},
23
+ "container-title": {"type": "object"},
24
+ "issued": { "$ref": "#/definitions/date-parts" },
25
+ "page": {"type": "object"},
26
+ "volume": {"type": "object"}
27
+ },
28
+ "required": ["id","issued"]
29
+ }
30
+ }
31
+ }
32
+ ]
33
+ }
34
+ }
35
+ },
36
+ "date-parts": {
37
+ "type": "object",
38
+ "properties": {
39
+ "c": {
40
+ "type": "object",
41
+ "properties": {
42
+ "year": {"type": "object"},
43
+ "month": {"type": "object"},
44
+ "day": {"type": "object"}
45
+ },
46
+ "required": ["year"]
47
+ }
48
+ }
49
+ }
50
+ },
51
+
52
+ "type": "array",
53
+ "items": [
54
+ {
55
+ "type": "object",
56
+ "properties": {
57
+ "unMeta": {
58
+ "type": "object",
59
+ "properties": {
60
+ "bibliography": {"type": "object"},
61
+ "references": { "$ref": "#/definitions/references" },
62
+ "csl": {"type": "object"},
63
+ "citation-abbreviations": {"type": "object"}
64
+ },
65
+ "anyOf": [
66
+ {
67
+ "bibliography": {"type": "object"},
68
+ "required": ["bibliography"]
69
+ },
70
+ {
71
+ "references": { "$ref": "#/definitions/references" },
72
+ "required": ["references"]
73
+ }
74
+ ]
75
+ }
76
+ }
77
+ },
78
+ {
79
+ "type": "array",
80
+ "items": [
81
+ {
82
+ "type": "object",
83
+ "properties": {
84
+ "t": {"enum": ["Div","Header"]},
85
+ "c": {
86
+ "type": "array",
87
+ "items": [
88
+ {
89
+ "type": "number"
90
+ },
91
+ {
92
+ "type": "array",
93
+ "items": [
94
+ {
95
+ "type": "string",
96
+ "enum": ["abstract","references"]
97
+ }
98
+ ]
99
+ }
100
+ ]
101
+ }
102
+ }
103
+ }
104
+ ]
105
+ }
106
+ ]
107
+ }
@@ -0,0 +1,19 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-04/schema#",
3
+ "title": "Default",
4
+ "description": "The default Pandoc schema",
5
+ "type": "array",
6
+ "items": [
7
+ {
8
+ "type": "object",
9
+ "properties": {
10
+ "unMeta": {
11
+ "type": "object"
12
+ }
13
+ }
14
+ },
15
+ {
16
+ "type": "array"
17
+ }
18
+ ]
19
+ }
@@ -0,0 +1,72 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-04/schema#",
3
+ "title": "JATS",
4
+ "description": "Journal Article Tagging Suite (JATS)",
5
+
6
+ "definitions": {
7
+ "authors": {
8
+ "type": "object"
9
+ },
10
+ "article": {
11
+ "type": "object",
12
+ "properties": {
13
+ "c": {
14
+ "type": "object",
15
+ "properties": {
16
+ "doi": {"type": "object"}
17
+ },
18
+ "required": ["doi"]
19
+ }
20
+ }
21
+ },
22
+ "journal": {
23
+ "type": "object",
24
+ "properties": {
25
+ "c": {
26
+ "type": "object",
27
+ "properties": {
28
+ "title": {"type": "object"},
29
+ "eissn": {"type": "object"},
30
+ "publisher-id": {"type": "object"}
31
+ },
32
+ "required": ["title","eissn","publisher-id"]
33
+ }
34
+ }
35
+ },
36
+ "publisher": {
37
+ "type": "object",
38
+ "properties": {
39
+ "c": {
40
+ "type": "object",
41
+ "properties": {
42
+ "name": {"type": "object"}
43
+ },
44
+ "required": ["name"]
45
+ }
46
+ }
47
+ }
48
+ },
49
+
50
+ "type": "array",
51
+ "items": [
52
+ {
53
+ "type": "object",
54
+ "properties": {
55
+ "unMeta": {
56
+ "type": "object",
57
+ "properties": {
58
+ "title": {"type": "object"},
59
+ "authors": { "$ref": "#/definitions/authors" },
60
+ "article": { "$ref": "#/definitions/article" },
61
+ "journal": { "$ref": "#/definitions/journal" },
62
+ "publisher": { "$ref": "#/definitions/publisher" }
63
+ },
64
+ "required": ["title","authors","article","journal","publisher"]
65
+ }
66
+ }
67
+ },
68
+ {
69
+ "type": "array"
70
+ }
71
+ ]
72
+ }
@@ -0,0 +1,35 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-04/schema#",
3
+ "title": "Jekyll",
4
+ "description": "Jekyll static site generator",
5
+
6
+ "definitions": {
7
+ "tags": {
8
+ "type": "object",
9
+ "properties": {
10
+ "c": {"type": "array"}
11
+ }
12
+ }
13
+ },
14
+
15
+ "type": "array",
16
+ "items": [
17
+ {
18
+ "type": "object",
19
+ "properties": {
20
+ "unMeta": {
21
+ "type": "object",
22
+ "properties": {
23
+ "title": {"type": "object"},
24
+ "layout": {"type": "object"},
25
+ "tags": { "$ref": "#/definitions/tags" }
26
+ },
27
+ "required": ["title","layout"]
28
+ }
29
+ }
30
+ },
31
+ {
32
+ "type": "array"
33
+ }
34
+ ]
35
+ }
@@ -0,0 +1,39 @@
1
+ require 'spec_helper'
2
+
3
+ describe Rakali::Converter do
4
+ describe "config" do
5
+ it "should read the default config" do
6
+ config = Rakali::Converter::DEFAULTS
7
+ config.fetch('from').fetch('format').should eq('md')
8
+ end
9
+
10
+ it "should read the config via file" do
11
+ file = fixture_path + 'docx.yml'
12
+ subject = Rakali::Converter.new(file)
13
+ subject.config.fetch('from').fetch('folder').should eq('minimal')
14
+ subject.config.fetch('from').fetch('format').should eq('docx')
15
+ end
16
+
17
+ it "should merge default format" do
18
+ file = fixture_path + 'only_folder_key.yml'
19
+ subject = Rakali::Converter.new(file)
20
+ subject.config.fetch('from').fetch('folder').should eq('minimal')
21
+ subject.config.fetch('from').fetch('format').should eq('md')
22
+ end
23
+
24
+ it "should raise an error when the config file doesn't exist" do
25
+ file = fixture_path + 'x'
26
+ lambda { Rakali::Converter.new(file) }.should raise_error SystemExit
27
+ end
28
+
29
+ it "should raise an error when the config file is empty" do
30
+ file = fixture_path + 'empty.yml'
31
+ lambda { Rakali::Converter.new(file) }.should raise_error SystemExit
32
+ end
33
+
34
+ it "should raise an error when the \"from\" key config doesn't exist" do
35
+ file = fixture_path + 'no_from_key.yml'
36
+ lambda { Rakali::Converter.new(file) }.should raise_error SystemExit
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,63 @@
1
+ require 'spec_helper'
2
+
3
+ describe Rakali::Document do
4
+ describe "convert" do
5
+ it "should convert minimal input" do
6
+ document = fixture_path + 'minimal.md'
7
+ config = Rakali::Utils.deep_merge_hashes(Rakali::Converter::DEFAULTS,
8
+ { 'from' => { 'folder' => fixture_path }, 'to' => { 'format' => 'docx' } })
9
+ subject = Rakali::Document.new(document, config)
10
+ subject.valid?.should be_truthy
11
+ subject.errors.should be_empty
12
+ subject.created?.should be_truthy
13
+ end
14
+
15
+ it "should convert multiple files" do
16
+ documents = Dir.glob("#{fixture_path}/*.md")
17
+ config = Rakali::Utils.deep_merge_hashes(Rakali::Converter::DEFAULTS,
18
+ { 'from' => { 'folder' => fixture_path }, 'to' => { 'format' => 'epub' }, 'merge' => true })
19
+ subject = Rakali::Document.new(documents, config)
20
+ subject.valid?.should be_truthy
21
+ subject.errors.should be_empty
22
+ subject.created?.should be_truthy
23
+ end
24
+ end
25
+
26
+ describe "validate" do
27
+ it "should validate with empty input" do
28
+ document = fixture_path + 'empty.md'
29
+ config = Rakali::Utils.deep_merge_hashes(Rakali::Converter::DEFAULTS,
30
+ { 'from' => { 'folder' => fixture_path } })
31
+ subject = Rakali::Document.new(document, config)
32
+ subject.valid?.should be_truthy
33
+ subject.errors.should be_empty
34
+ end
35
+
36
+ it "should not validate with empty input and extended schema" do
37
+ document = fixture_path + 'empty.md'
38
+ config = Rakali::Utils.deep_merge_hashes(Rakali::Converter::DEFAULTS,
39
+ { 'from' => { 'folder' => fixture_path }, 'schema' => 'jekyll.json' })
40
+ subject = Rakali::Document.new(document, config)
41
+ subject.valid?.should be_falsey
42
+ subject.errors.length.should == 2
43
+ subject.errors.first.should match("The property '#/0/unMeta' did not contain a required property of 'title'")
44
+ subject.errors.last.should match("The property '#/0/unMeta' did not contain a required property of 'layout'")
45
+ end
46
+
47
+ it "should not validate with empty input and extended schema and raise error" do
48
+ document = fixture_path + 'empty.md'
49
+ config = Rakali::Utils.deep_merge_hashes(Rakali::Converter::DEFAULTS,
50
+ { 'from' => { 'folder' => fixture_path }, 'schema' => 'jekyll.json', 'strict' => true })
51
+ lambda { Rakali::Document.new(document, config) }.should raise_error SystemExit
52
+ end
53
+
54
+ it "should validate with extended input and extended schema" do
55
+ document = fixture_path + 'jekyll.md'
56
+ config = Rakali::Utils.deep_merge_hashes(Rakali::Converter::DEFAULTS,
57
+ { 'from' => { 'folder' => fixture_path }, 'schema' => 'jekyll.json' })
58
+ subject = Rakali::Document.new(document, config)
59
+ subject.valid?.should be_truthy
60
+ subject.errors.should be_empty
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,3 @@
1
+ from:
2
+ folder: minimal
3
+ format: docx
File without changes
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,4 @@
1
+ from:
2
+ folder: fixtures/minimal
3
+ format: md
4
+ schema: jekyll.json
@@ -0,0 +1,8 @@
1
+ ---
2
+ layout: post
3
+ title: "Nine simple ways to make it easier to (re)use your data"
4
+ ---
5
+
6
+ # Title
7
+
8
+ This is a **test**.
@@ -0,0 +1,7 @@
1
+ ## Merge Title
2
+
3
+ This is another document and has a list.
4
+
5
+ * one
6
+ * two
7
+ * three
@@ -0,0 +1,3 @@
1
+ # title
2
+
3
+ This is a **test**.
@@ -0,0 +1 @@
1
+ schema: default.json
@@ -0,0 +1,2 @@
1
+ from:
2
+ folder: minimal
@@ -0,0 +1,101 @@
1
+ require 'spec_helper'
2
+
3
+ describe Rakali::Logger do
4
+ let(:topic) { "Topic:" }
5
+ let(:message) { "This is the message." }
6
+ let(:output) { " #{topic} #{message}" }
7
+
8
+ describe "debug" do
9
+ subject { Rakali::Logger.new(Rakali::Logger::DEBUG) }
10
+
11
+ it "initialize" do
12
+ subject.log_level.should == 0
13
+ end
14
+
15
+ it "debug" do
16
+ capture_stdout { subject.debug topic, message }.should start_with(output)
17
+ end
18
+
19
+ it "info" do
20
+ capture_stdout { subject.info topic, message }.should start_with(output)
21
+ end
22
+
23
+ it "warn" do
24
+ capture_stderr { subject.warn topic, message }.should start_with(output.yellow)
25
+ end
26
+
27
+ it "error" do
28
+ capture_stderr { subject.error topic, message }.should start_with(output.red)
29
+ end
30
+ end
31
+
32
+ describe "info" do
33
+ it "initialize" do
34
+ subject.log_level.should == 1
35
+ end
36
+
37
+ it "debug" do
38
+ capture_stdout { subject.debug topic, message }.should eq("")
39
+ end
40
+
41
+ it "info" do
42
+ capture_stdout { subject.info topic, message }.should start_with(output)
43
+ end
44
+
45
+ it "warn" do
46
+ capture_stderr { subject.warn topic, message }.should start_with(output.yellow)
47
+ end
48
+
49
+ it "error" do
50
+ capture_stderr { subject.error topic, message }.should start_with(output.red)
51
+ end
52
+ end
53
+
54
+ describe "warn" do
55
+ subject { Rakali::Logger.new(Rakali::Logger::WARN) }
56
+
57
+ it "initialize" do
58
+ subject.log_level.should == 2
59
+ end
60
+
61
+ it "debug" do
62
+ capture_stdout { subject.debug topic, message }.should eq("")
63
+ end
64
+
65
+ it "info" do
66
+ capture_stdout { subject.info topic, message }.should eq("")
67
+ end
68
+
69
+ it "warn" do
70
+ capture_stderr { subject.warn topic, message }.should start_with(output.yellow)
71
+ end
72
+
73
+ it "error" do
74
+ capture_stderr { subject.error topic, message }.should start_with(output.red)
75
+ end
76
+ end
77
+
78
+ describe "error" do
79
+ subject { Rakali::Logger.new(Rakali::Logger::ERROR) }
80
+
81
+ it "initialize" do
82
+ subject.log_level.should == 3
83
+ end
84
+
85
+ it "debug" do
86
+ capture_stdout { subject.debug topic, message }.should eq("")
87
+ end
88
+
89
+ it "info" do
90
+ capture_stdout { subject.info topic, message }.should eq("")
91
+ end
92
+
93
+ it "warn" do
94
+ capture_stderr { subject.warn topic, message }.should eq("")
95
+ end
96
+
97
+ it "error" do
98
+ capture_stderr { subject.error topic, message }.should start_with(output.red)
99
+ end
100
+ end
101
+ end
@@ -0,0 +1,62 @@
1
+ require 'bundler/setup'
2
+ Bundler.setup
3
+
4
+ require 'rakali'
5
+
6
+ RSpec.configure do |config|
7
+ config.before do
8
+ ARGV.replace []
9
+ end
10
+
11
+ def fixture_path
12
+ File.expand_path("../fixtures", __FILE__) + '/'
13
+ end
14
+
15
+ # This code was adapted from Thor, available under MIT-LICENSE
16
+ # Copyright (c) 2008 Yehuda Katz, Eric Hodel, et al.
17
+ def capture(stream)
18
+ begin
19
+ stream = stream.to_s
20
+ eval "$#{stream} = StringIO.new"
21
+ yield
22
+ result = eval("$#{stream}").string
23
+ ensure
24
+ eval("$#{stream} = #{stream.upcase}")
25
+ end
26
+
27
+ result
28
+ end
29
+
30
+ def capture_stdout(&block)
31
+ original_stdout = $stdout
32
+ $stdout = fake = StringIO.new
33
+ begin
34
+ yield
35
+ ensure
36
+ $stdout = original_stdout
37
+ end
38
+ fake.string
39
+ end
40
+
41
+ def capture_stderr(&block)
42
+ original_stderr = $stderr
43
+ $stderr = fake = StringIO.new
44
+ begin
45
+ yield
46
+ ensure
47
+ $stderr = original_stderr
48
+ end
49
+ fake.string
50
+ end
51
+
52
+ # This code was adapted from Ruby on Rails, available under MIT-LICENSE
53
+ # Copyright (c) 2004-2013 David Heinemeier Hansson
54
+ def silence_warnings
55
+ old_verbose, $VERBOSE = $VERBOSE, nil
56
+ yield
57
+ ensure
58
+ $VERBOSE = old_verbose
59
+ end
60
+
61
+ alias silence capture
62
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rakali
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.15
4
+ version: 0.0.17
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martin Fenner
@@ -129,7 +129,23 @@ executables:
129
129
  extensions: []
130
130
  extra_rdoc_files: []
131
131
  files:
132
+ - ".gitignore"
133
+ - ".rakali.yml"
134
+ - ".travis.yml"
135
+ - Gemfile
136
+ - Gemfile.lock
137
+ - LICENSE
138
+ - README.md
139
+ - Rakefile
132
140
  - bin/rakali
141
+ - examples/2013-11-17-the-grammar-of-scholarly-communication.md
142
+ - examples/2013-12-12-from-markdown-to-jats-xml-in-one-step.md
143
+ - examples/2014-07-24-dont-reinvent-the-wheel.md
144
+ - examples/2014-08-06-what-is-doi.md
145
+ - examples/fenner_2011.docx
146
+ - examples/fenner_2013.docx
147
+ - filters/caps.rb
148
+ - filters/default.rb
133
149
  - lib/rakali.rb
134
150
  - lib/rakali/cli.rb
135
151
  - lib/rakali/converter.rb
@@ -137,6 +153,24 @@ files:
137
153
  - lib/rakali/logger.rb
138
154
  - lib/rakali/utils.rb
139
155
  - lib/rakali/version.rb
156
+ - rakali.gemspec
157
+ - schemata/citeproc.json
158
+ - schemata/default.json
159
+ - schemata/jats.json
160
+ - schemata/jekyll.json
161
+ - spec/converter_spec.rb
162
+ - spec/document_spec.rb
163
+ - spec/fixtures/docx.yml
164
+ - spec/fixtures/empty.md
165
+ - spec/fixtures/empty.yml
166
+ - spec/fixtures/incomplete.yml
167
+ - spec/fixtures/jekyll.md
168
+ - spec/fixtures/merge.md
169
+ - spec/fixtures/minimal.md
170
+ - spec/fixtures/no_from_key.yml
171
+ - spec/fixtures/only_folder_key.yml
172
+ - spec/logger_spec.rb
173
+ - spec/spec_helper.rb
140
174
  homepage: https://github.com/rakali/rakali.rb
141
175
  licenses:
142
176
  - MIT