bibsync 0.0.5 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 656b20418877066a48f7e30f3f702b47e71c7a5d
4
- data.tar.gz: af9f5d4f088255dccc86bd12655a009830050ac3
3
+ metadata.gz: 5578a98ae327ca6b8ee4b5fb0da4b2bbd1786116
4
+ data.tar.gz: eb56b9ab2ce3d9f1b6599da6a1b4ca0c76db220e
5
5
  SHA512:
6
- metadata.gz: 55e6153fa4ffa968cbf1f08dba256c2a0c3b427f6a84babc16d99e62bb519e2450335191dd6ce0c4bc8eac1a830d1ba27999215a517b35feb24f4eda2ed92cd5
7
- data.tar.gz: 4f9a2308012649be07f5bf3c8549c1a9ae9a1c0410e8c1f33de47e7991dcd1f067e83ea3e96a5c67e518a7b67980ad358bfa24f641a9f6f12740bff76da35449
6
+ metadata.gz: 3050a0ec740223617d6f700bca7725443e3ab2b1a1cce7ef46c6091ed777d18b3bdc3f6f9ad17abcce3b40f923959069960d977128c112cfbe84e933a3bcfbcd
7
+ data.tar.gz: d4692dcd5ce86c2cfaf6698a7c77ca5450487067077ba87804325b54cf77a2e7654ae025914819476be79f649e56d21e1c5e056402e46dc02a5d8a3113ed0a57
data/.gitignore CHANGED
@@ -1,6 +1,7 @@
1
1
  *.swp
2
2
  *.gem
3
+ .#*
3
4
  Gemfile.lock
4
5
  .bundle
5
6
  .yardoc
6
-
7
+ test/tmp
@@ -8,4 +8,6 @@ rvm:
8
8
  before_install:
9
9
  - sudo apt-get update -qq
10
10
  - sudo apt-get install -qq poppler-utils jabref
11
-
11
+ matrix:
12
+ allow_failures:
13
+ - rvm: ruby-head
data/README.md CHANGED
@@ -27,11 +27,21 @@ BibSync supports the following features:
27
27
  * Downloading of new versions of [arXiv](http://arxiv.org/) papers
28
28
  * Simple validation of [BibTeX](http://en.wikipedia.org/wiki/BibTeX) files (Checks for missing fields etc)
29
29
  * Simple transformation of [BibTeX](http://en.wikipedia.org/wiki/BibTeX) fields (Normalization of author, year and journal field...)
30
- * Works under every platform supporting Ruby (Linux, Windows, ...)
30
+ * Works under every platform supporting Ruby and `pdftotext` (Linux, Windows, ...)
31
31
 
32
32
  Quick start
33
33
  -----------
34
34
 
35
+ At first you have to ensure that you have the `pdftotext` program available on your `$PATH`. Under Debian you can install
36
+ the package using `apt-get` as follows
37
+
38
+ ~~~
39
+ $ apt-get install poppler-utils
40
+ $ pdftotext
41
+ pdftotext version 0.24.1
42
+ ...
43
+ ~~~
44
+
35
45
  BibSync requires Ruby >= 1.9.2 to run. It is distributed as a RubyGems package. You can install it via
36
46
  the command line
37
47
 
@@ -39,6 +49,12 @@ the command line
39
49
  $ gem install bibsync
40
50
  ~~~
41
51
 
52
+ And for updating, you write
53
+
54
+ ~~~
55
+ $ gem update bibsync
56
+ ~~~
57
+
42
58
  After that you can use the 'bibsync' tool on the command line. At first let's validate
43
59
  a [BibTeX](http://en.wikipedia.org/wiki/BibTeX) file called 'thesis.bib'.
44
60
 
@@ -9,15 +9,15 @@ Gem::Specification.new do |s|
9
9
  s.authors = ['Daniel Mendler']
10
10
  s.email = ['mail@daniel-mendler.de']
11
11
  s.summary = 'BibSync is a tool to synchronize scientific papers and BibTeX bibliography files'
12
- s.description = 'BibSync is a tool to synchronize scientific papers and BibTeX bibliography files'
12
+ s.description = 'BibSync is a tool to synchronize scientific papers and BibTeX bibliography files. It automatically downloads the metadata from dx.doi.org and arxiv.org.'
13
13
  s.homepage = 'https://github.com/minad/bibsync'
14
14
  s.rubyforge_project = s.name
15
+ s.license = 'MIT'
15
16
 
16
17
  s.files = `git ls-files`.split("\n")
17
18
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
18
19
  s.require_paths = %w(lib)
19
20
 
20
- s.add_runtime_dependency('nokogiri')
21
21
  s.add_runtime_dependency('faraday')
22
22
  s.add_runtime_dependency('faraday_middleware')
23
23
  s.add_development_dependency('rake')
@@ -1,10 +1,10 @@
1
- require 'nokogiri'
2
1
  require 'faraday'
3
2
  require 'faraday_middleware'
4
3
  require 'shellwords'
5
4
  require 'date'
6
5
  require 'pathname'
7
6
  require 'forwardable'
7
+ require 'rexml/document'
8
8
  require 'bibsync/version'
9
9
  require 'bibsync/utils'
10
10
  require 'bibsync/log'
@@ -17,8 +17,8 @@ module BibSync
17
17
  @bib.select {|e| e[:arxiv] }.each_slice(SliceSize) do |entry|
18
18
  begin
19
19
  xml = fetch_xml('http://export.arxiv.org/api/query', id_list: entry.map{|e| arxiv_id(e, version: false, prefix: true) }.join(','), max_results: SliceSize)
20
- xml.xpath('//entry/id').map(&:content).each_with_index do |id, i|
21
- id.gsub!('http://arxiv.org/abs/', '')
20
+ xml.get_elements('//entry').each_with_index do |e, i|
21
+ id = e.elements['id'].text.gsub('http://arxiv.org/abs/', '')
22
22
  if id != entry[i][:arxiv]
23
23
  info("#{entry[i][:arxiv]} replaced by http://arxiv.org/pdf/#{id}", key: entry[i])
24
24
  arxiv_download(@dir, id) if @update
@@ -18,8 +18,6 @@ module BibSync
18
18
  (!@force && entry[:title] && entry[:author] && entry[:year])
19
19
 
20
20
  determine_arxiv_and_doi(entry)
21
-
22
- @bib.save
23
21
  end
24
22
  end
25
23
 
@@ -49,8 +47,9 @@ module BibSync
49
47
  begin
50
48
  info('Fetch missing arXiv identifier', key: entry)
51
49
  xml = fetch_xml('http://export.arxiv.org/api/query', search_query: "doi:#{entry[:doi]}", max_results: 1)
52
- if xml.xpath('//entry/doi').map(&:content).first == entry[:doi]
53
- id = xml.xpath('//entry/id').map(&:content).first
50
+ doi = xml.elements['//arxiv:doi']
51
+ if doi && doi.text == entry[:doi]
52
+ id = xml.elements['//entry/id'].text
54
53
  if id =~ %r{\Ahttp://arxiv.org/abs/(.+)\Z}
55
54
  entry[:arxiv] = $1
56
55
  end
@@ -18,7 +18,7 @@ module BibSync
18
18
  @fetch.each do |url|
19
19
  if url =~ /\A(\d+\.\d+)(v\d+)?\Z/
20
20
  arxivs << $1
21
- elsif url =~ %r{\Ahttp://arxiv.org/abs/(\d+\.\d+)\Z}
21
+ elsif url =~ %r{\Ahttp://arxiv.org/abs/(\d+\.\d+)(v\d+)?\Z}
22
22
  arxivs << $1
23
23
  else
24
24
  urls << url
@@ -38,8 +38,8 @@ module BibSync
38
38
  arxivs.each_slice(SliceSize) do |ids|
39
39
  begin
40
40
  xml = fetch_xml('http://export.arxiv.org/api/query', id_list: ids.join(','), max_results: SliceSize)
41
- xml.xpath('//entry/id').map(&:content).each_with_index do |id, i|
42
- id.gsub!('http://arxiv.org/abs/', '')
41
+ xml.each_element('//entry/id') do |id|
42
+ id = id.text.gsub('http://arxiv.org/abs/', '')
43
43
  info 'arXiv download', key: id
44
44
  arxiv_download(@dir, id)
45
45
  end
@@ -38,8 +38,6 @@ module BibSync
38
38
  warning("Cited in #{files} but not found in #{@bib.file}", key: key)
39
39
  end
40
40
  end
41
-
42
- @bib.save
43
41
  end
44
42
  end
45
43
  end
@@ -32,8 +32,6 @@ module BibSync
32
32
  entry.type ||= :ARTICLE
33
33
  entry.file = file
34
34
  end
35
-
36
- @bib.save
37
35
  end
38
36
  end
39
37
  end
@@ -15,6 +15,8 @@ module BibSync
15
15
  @bib.to_a.each do |entry|
16
16
  next if entry.comment?
17
17
 
18
+ entry.delete(:abstract) if @force
19
+
18
20
  if @force || !(entry[:title] && entry[:author] && entry[:year])
19
21
  if entry[:arxiv]
20
22
  if entry.key == arxiv_id(entry, prefix: false, version: true)
@@ -27,11 +29,12 @@ module BibSync
27
29
  update_doi(entry) if entry[:doi]
28
30
  end
29
31
 
30
- if entry[:doi] =~ /\A10\.1103\// && (@force || !entry[:abstract])
32
+ if entry[:doi] =~ /\A10\.1103\// && !entry[:abstract]
31
33
  update_aps_abstract(entry)
32
34
  end
33
35
 
34
- @bib.save
36
+ # Add timestamp when this entry was added
37
+ entry[:added] ||= Date.today.to_s
35
38
  end
36
39
  end
37
40
 
@@ -39,20 +42,34 @@ module BibSync
39
42
 
40
43
  def update_aps_abstract(entry)
41
44
  info("Downloading APS abstract", key: entry)
42
- html = fetch_html("http://link.aps.org/doi/#{entry[:doi]}")
43
- entry[:abstract] = html.css('.aps-abstractbox').map(&:content).first
45
+ html = fetch("http://link.aps.org/doi/#{entry[:doi]}")
46
+ if html =~ %r{<div class='aps-abstractbox'>(.*?)</div>}
47
+ entry[:abstract] = $1.gsub(/<[^>]+>/, '')
48
+ end
44
49
  rescue => ex
45
50
  error('Abstract download failed', key: entry, ex: ex)
46
51
  end
47
52
 
48
53
  def update_doi(entry)
49
- info('Downloading DOI metadata', key: entry)
50
- text = fetch("http://dx.doi.org/#{entry[:doi]}", nil, 'Accept' => 'text/bibliography; style=bibtex')
54
+ url = "http://dx.doi.org/#{entry[:doi]}"
55
+ info("Downloading DOI metadata from #{url}", key: entry)
56
+ text = fetch(url, nil, 'Accept' => 'text/bibliography; style=bibtex')
51
57
  raise text if text == 'Unknown DOI'
52
58
  Entry.parse(text).each {|k, v| entry[k] = v }
53
59
  rescue => ex
54
- entry.delete(:doi)
55
60
  error('DOI download failed', key: entry, ex: ex)
61
+ # dx.doi.org shows spurious 500 errors
62
+ if ex.respond_to?(:response) && ex.response[:status] == 500
63
+ tries ||= 0
64
+ tries += 1
65
+ if tries < 10
66
+ info('Retrying...', key: entry)
67
+ retry
68
+ else
69
+ error('Giving up :(', key: entry)
70
+ end
71
+ end
72
+ entry.delete(:doi)
56
73
  end
57
74
 
58
75
  # Rename arxiv file if key contains version
@@ -89,36 +106,36 @@ module BibSync
89
106
  entry.file = new_path
90
107
  end
91
108
 
92
- @bib.save
93
-
94
109
  entry
95
110
  end
96
111
 
97
112
  def update_arxiv(entry)
98
113
  info('Downloading arXiv metadata', key: entry)
114
+
99
115
  xml = fetch_xml('http://export.arxiv.org/oai2', verb: 'GetRecord', identifier: "oai:arXiv.org:#{arxiv_id(entry, prefix: true, version: false)}", metadataPrefix: 'arXiv')
100
- error = xml.xpath('//error').map(&:content).first
101
- raise error if error
102
-
103
- entry[:title] = xml.xpath('//arXiv/title').map(&:content).first
104
- entry[:abstract] = xml.xpath('//arXiv/abstract').map(&:content).first
105
- entry[:primaryclass] = xml.xpath('//arXiv/categories').map(&:content).first.split(/\s+/).first
106
- entry[:author] = xml.xpath('//arXiv/authors/author').map do |author|
107
- "{#{author.xpath('keyname').map(&:content).first}}, {#{author.xpath('forenames').map(&:content).first}}"
116
+ error = xml.elements['//error']
117
+ raise error.text if error
118
+
119
+ arXiv = xml.elements['//arXiv']
120
+
121
+ entry[:title] = arXiv.elements['title'].text
122
+ entry[:abstract] = arXiv.elements['abstract'].text
123
+ entry[:arxivcategories] = arXiv.elements['categories'].text
124
+ entry[:primaryclass] = entry[:arxivcategories].split(/\s+/).first
125
+ entry[:author] = arXiv.get_elements('authors/author').map do |author|
126
+ "{#{author.elements['keyname'].text}}, {#{author.elements['forenames'].text}}"
108
127
  end.join(' and ')
109
128
  entry[:journal] = 'ArXiv e-prints'
110
129
  entry[:eprint] = entry[:arxiv]
111
130
  entry[:archiveprefix] = 'arXiv'
112
- date = xml.xpath('//arXiv/updated').map(&:content).first || xml.xpath('//arXiv/created').map(&:content).first
113
- date = Date.parse(date)
131
+ entry[:arxivcreated] = arXiv.elements['created'].text if arXiv.elements['created']
132
+ entry[:arxivupdated] = arXiv.elements['updated'].text if arXiv.elements['updated']
133
+ date = Date.parse(entry[:arxivupdated] || entry[:arxivcreated])
114
134
  entry[:year] = date.year
115
135
  entry[:month] = Literal.new(%w(jan feb mar apr may jun jul aug sep oct nov dec)[date.month - 1])
116
- doi = xml.xpath('//arXiv/doi').map(&:content).first
117
- entry[:doi] = doi if doi
118
- journal = xml.xpath('//arXiv/journal-ref').map(&:content).first
119
- entry[:journal] = journal if journal
120
- comments = xml.xpath('//arXiv/comments').map(&:content).first
121
- entry[:comments] = comments if comments
136
+ entry[:doi] = arXiv.elements['doi'].text if arXiv.elements['doi']
137
+ entry[:journal] = arXiv.elements['journal-ref'].text if arXiv.elements['journal-ref']
138
+ entry[:comments] = arXiv.elements['comments'].text if arXiv.elements['comments']
122
139
  entry[:url] = "http://arxiv.org/abs/#{entry[:arxiv]}"
123
140
  rescue => ex
124
141
  entry.delete(:arxiv)
@@ -55,8 +55,13 @@ module BibSync
55
55
  raise 'No filename given' unless @file
56
56
  if @dirty
57
57
  @save_hook.call(self) if @save_hook
58
- File.open("#{@file}.tmp", 'w') {|f| f.write(self) }
59
- File.rename("#{@file}.tmp", @file)
58
+ tmpfile = "#{@file}.tmp"
59
+ begin
60
+ File.open(tmpfile, 'w') {|f| f.write(self) }
61
+ File.rename(tmpfile, @file)
62
+ ensure
63
+ File.unlink(tmpfile) rescue nil
64
+ end
60
65
  @dirty = false
61
66
  true
62
67
  else
@@ -83,14 +83,15 @@ module BibSync
83
83
 
84
84
  def process
85
85
  if @args.size != 0
86
- error 'Too many arguments'
86
+ puts 'Too many arguments'
87
87
  puts @opts
88
88
  exit
89
89
  end
90
90
 
91
91
  if @options[:bib]
92
- @options[:bib] = Bibliography.new(@options[:bib])
93
- @options[:bib].save_hook = Transformer.new
92
+ bib = @options[:bib] = Bibliography.new(@options[:bib])
93
+ bib.save_hook = Transformer.new
94
+ at_exit { bib.save }
94
95
  end
95
96
 
96
97
  actions = []
@@ -27,13 +27,7 @@ module BibSync
27
27
  end
28
28
 
29
29
  def fetch_xml(url, params = nil, headers = nil)
30
- xml = Nokogiri::XML(fetch(url, params, headers))
31
- xml.remove_namespaces!
32
- xml
33
- end
34
-
35
- def fetch_html(url, params = nil, headers = nil)
36
- Nokogiri::HTML(fetch(url, params, headers))
30
+ REXML::Document.new(fetch(url, params, headers)).root
37
31
  end
38
32
 
39
33
  def arxiv_id(arxiv, opts = {})
@@ -1,3 +1,3 @@
1
1
  module BibSync
2
- VERSION = '0.0.5'
2
+ VERSION = '0.0.8'
3
3
  end
@@ -24,13 +24,7 @@ describe BibSync::Utils do
24
24
 
25
25
  describe '#fetch_xml' do
26
26
  it 'fetches xml' do
27
- fetch_xml('http://export.arxiv.org/oai2', verb: 'GetRecord', identifier: 'oai:arXiv.org:1208.2881', metadataPrefix: 'arXiv').must_be_instance_of Nokogiri::XML::Document
28
- end
29
- end
30
-
31
- describe '#fetch_html' do
32
- it 'fetches html' do
33
- fetch_html('http://google.com').must_be_instance_of Nokogiri::HTML::Document
27
+ fetch_xml('http://export.arxiv.org/oai2', verb: 'GetRecord', identifier: 'oai:arXiv.org:1208.2881', metadataPrefix: 'arXiv').must_be_instance_of REXML::Element
34
28
  end
35
29
  end
36
30
 
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bibsync
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Daniel Mendler
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-04 00:00:00.000000000 Z
11
+ date: 2013-10-08 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: nokogiri
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - '>='
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - '>='
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: faraday
29
15
  requirement: !ruby/object:Gem::Requirement
@@ -81,7 +67,7 @@ dependencies:
81
67
  - !ruby/object:Gem::Version
82
68
  version: '0'
83
69
  description: BibSync is a tool to synchronize scientific papers and BibTeX bibliography
84
- files
70
+ files. It automatically downloads the metadata from dx.doi.org and arxiv.org.
85
71
  email:
86
72
  - mail@daniel-mendler.de
87
73
  executables:
@@ -135,7 +121,8 @@ files:
135
121
  - test/test_entry.rb
136
122
  - test/test_utils.rb
137
123
  homepage: https://github.com/minad/bibsync
138
- licenses: []
124
+ licenses:
125
+ - MIT
139
126
  metadata: {}
140
127
  post_install_message:
141
128
  rdoc_options: []