arxivsync 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +53 -0
- data/Rakefile +1 -0
- data/arxivsync.gemspec +28 -0
- data/lib/arxivsync/downloader.rb +57 -0
- data/lib/arxivsync/parser.rb +67 -0
- data/lib/arxivsync/version.rb +3 -0
- data/lib/arxivsync/xmlarchive.rb +107 -0
- data/lib/arxivsync.rb +24 -0
- data/test/fixtures/406001.xml +23932 -0
- data/test/parser.rb +29 -0
- metadata +157 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Jaiden Mispy
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
# arxivsync-0.0.3
|
2
|
+
|
3
|
+
Ruby OAI interface for harvesting the arXiv. Can be used to store and update an XML mirror of paper metadata, and parse the XML into Ruby objects to allow conversion into a friendlier format.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
```
|
8
|
+
gem install arxivsync
|
9
|
+
```
|
10
|
+
|
11
|
+
## Usage
|
12
|
+
|
13
|
+
### Creating or updating an archive
|
14
|
+
|
15
|
+
```
|
16
|
+
archive = ArxivSync::XMLArchive.new("/home/foo/savedir")
|
17
|
+
archive.sync
|
18
|
+
```
|
19
|
+
|
20
|
+
Stores each XML response as an individual file, each containing up to 1000 records. Following an initial harvest, you can rerun this to add additional files containing all records since the last harvest.
|
21
|
+
|
22
|
+
Remember to leave at least a day between syncs — the temporal granularity doesn't go any smaller than that!
|
23
|
+
|
24
|
+
### Reading from an archive
|
25
|
+
|
26
|
+
```
|
27
|
+
archive = ArxivSync::XMLArchive.new("/home/foo/savedir")
|
28
|
+
archive.read_metadata do |paper|
|
29
|
+
# Do stuff with paper
|
30
|
+
end
|
31
|
+
```
|
32
|
+
|
33
|
+
Parses the XML files using Nokogiri's SAX parser and yields Structs representing the metadata as it goes.
|
34
|
+
|
35
|
+
### Download and parse immediately
|
36
|
+
|
37
|
+
If you just want arxivsync to do the request-cycle and parsing bits but handle storage yourself:
|
38
|
+
|
39
|
+
```
|
40
|
+
ArxivSync.get_metadata(oai_params) do |resp, papers|
|
41
|
+
papers.each do |paper|
|
42
|
+
# Do stuff with paper
|
43
|
+
end
|
44
|
+
end
|
45
|
+
```
|
46
|
+
|
47
|
+
## Contributing
|
48
|
+
|
49
|
+
1. Fork it
|
50
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
51
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
52
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
53
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# Pulls in Bundler's standard gem tasks (rake build / install / release).
require "bundler/gem_tasks"
|
data/arxivsync.gemspec
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8

# Make lib/ loadable so we can read the gem version without installing.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'arxivsync/version'

Gem::Specification.new do |spec|
  # Identity
  spec.name          = "arxivsync"
  spec.version       = ArxivSync::VERSION
  spec.authors       = ["Jaiden Mispy"]
  spec.email         = ["scirate@mispy.me"]
  spec.description   = %q{OAI interface for harvesting the arXiv database}
  spec.summary       = %q{OAI interface for harvesting the arXiv database}
  spec.homepage      = ""
  spec.license       = "MIT"

  # Packaged files come straight from git.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Development-only dependencies.
  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "minitest"

  # Runtime dependencies.
  spec.add_runtime_dependency "oai"
  spec.add_runtime_dependency "nokogiri"
  spec.add_runtime_dependency "ox", ">= 2.0.2"
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module ArxivSync
  # Drives the OAI-PMH harvest cycle against the arXiv export endpoint,
  # yielding each raw ListRecords response to the caller's block.
  class Downloader
    def initialize(initial_params={})
      @initial_params = initial_params
      # A resumptionToken is exclusive with other OAI parameters, so only
      # default the metadataPrefix when starting a fresh harvest.
      @initial_params[:metadataPrefix] ||= 'arXiv' unless @initial_params[:resumptionToken]
      @last_params = nil

      @oai = OAI::Client.new('http://export.arxiv.org/oai2', :parser => 'libxml')
    end

    # Issue ListRecords requests repeatedly, following resumption tokens
    # until the server stops sending them. Yields every response received.
    def start(&b)
      # Make the initial request
      resp = make_request(@initial_params)

      loop do
        token = resp.resumption_token
        if token && !token.empty?
          # We have a resumption_token, keep going!
          b.call(resp)
          resp = make_request(resumptionToken: token)
        elsif resp.doc.to_s.include?("Retry after 20 seconds") # Rate limitation
          puts "Honoring 503 and sleeping for 20 seconds..."
          sleep 20
          resp = retry_request
        else
          # No resumption_token and no retry should mean we're finished
          b.call(resp)
          puts "Finished archiving!"
          break
        end
      end

      self
    end

    # Re-issue the most recent request verbatim.
    def retry_request
      make_request(@last_params)
    end

    # Perform one ListRecords call. Timeouts are retried after a pause;
    # note the retry is unbounded, matching the long-running-mirror use case.
    def make_request(params)
      puts "Making OAI request with params: #{params.inspect}"

      @last_params = params.clone # list_records will nuke our params

      begin
        @oai.list_records(params)
      rescue Faraday::Error::TimeoutError
        puts "Request timed out; retrying in 20 seconds"
        sleep 20
        retry_request
      end
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module ArxivSync
  # Lightweight value objects emitted by the SAX parse.
  Author = Struct.new(:keyname, :forenames)

  Paper = Struct.new(
    :id, :created, :updated, :title, :abstract, :authors,
    :primary_category, :crosslists
  )

  # Ox SAX handler that accumulates arXiv OAI metadata records into
  # Paper structs; the result is available via #papers after parsing.
  class XMLParser < ::Ox::Sax
    attr_accessor :papers

    def start_element(name, attributes=[])
      @el = name
      case name
      when :ListRecords
        # Top of a response: reset the output collection.
        @papers = []
      when :metadata
        # A new paper record begins.
        @model = Paper.new
        @authors = []
      when :author
        @author = Author.new
      end
    end

    # Collapse whitespace runs (the arXiv XML hard-wraps long fields).
    def clean(str)
      str.gsub(/\s+/, ' ').strip
    end

    def text(str)
      case @el
      when :id        then @model.id = str
      when :created   then @model.created = Date.parse(str)
      when :updated   then @model.updated = Date.parse(str)
      when :title     then @model.title = clean(str)
      when :abstract  then @model.abstract = clean(str)
      when :categories
        # First token is the primary category; any remainder are crosslists.
        cats = str.split
        @model.primary_category = cats[0]
        @model.crosslists = cats.drop(1)
      when :keyname   then @author.keyname = str
      when :forenames then @author.forenames = str
      end
    end

    def end_element(name)
      case name
      when :author
        @authors.push(@author)
      when :metadata # End of a paper entry
        #@paper.updated_date ||= @paper.pubdate # If no separate updated date
        #@paper.feed_id = Feed.get_or_create(@primary_category).id
        @model.authors = @authors

        @papers.push(@model)
      end
      @el = nil
    end
  end
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
module ArxivSync
  # Maintains an on-disk mirror of arXiv OAI-PMH responses: one XML file
  # per ListRecords response, named "<responseDate>_<resumptionToken|final>".
  class XMLArchive
    # savedir: directory the XML responses live in (created if absent).
    # custom_params: currently unused; retained for interface compatibility.
    def initialize(savedir, custom_params=nil)
      @savedir = savedir
      begin Dir.mkdir(@savedir) # Ensure this directory exists
      rescue Errno::EEXIST
      end
    end

    # Parse the timestamp from the path to a previously saved
    # arxiv xml block (filenames look like "<responseDate>_<suffix>").
    def parse_dt(path)
      DateTime.parse(path.split('/')[-1].split('_')[0])
    end

    # Download from the arXiv!
    # This can be called in three potential states:
    # - The savedir has yet to be populated with any xml, meaning we need to
    #   start a full mirror of the entire database.
    #
    # - The most recent xml file contains a resumptionToken, meaning the last
    #   harvest attempt was aborted prematurely and we need to resume.
    #
    # - The most recent xml file does not have a resumptionToken, in which case
    #   we begin a new harvest for everything since the responseDate of the last.
    def sync(oai_params={})
      # Find any existing xml files we may have, sorted by
      # responseDate in the filename. sort_by computes each key once
      # rather than re-parsing timestamps inside a comparison block.
      existing = Dir.glob(File.join(@savedir, '*')).sort_by { |path| parse_dt(path) }

      if existing.empty?
        puts "Commencing full arXiv download. This will take quite a while.\n" +
             "Download can be safely aborted at any point and will resume from\n" +
             "last successful response."
      else
        # Parse the most recent one
        last_response = Nokogiri::XML(File.open(existing[-1]))
        last_token = last_response.css('resumptionToken').text

        if last_token.empty? # Previous sync completed successfully
          responseDate = Date.parse(last_response.css('responseDate').text)
          if responseDate == Date.today
            puts "Last responseDate was today. arXiv lacks date granularity beyond the day level; please wait before continuing harvest."
            return false
          end
          puts "Downloading from last responseDate: #{responseDate}"
          oai_params[:from] ||= responseDate
        else # Previous sync aborted prematurely, resume
          puts "Resuming download using previous resumptionToken: #{last_token}"
          oai_params = { resumptionToken: last_token }
        end
      end

      downloader = Downloader.new(oai_params)
      downloader.start do |resp|
        save_response(resp)
      end
    end

    # Parses the archive using the Ox SAX parser.
    # Yields the Array of Paper objects from each stored response.
    def read_metadata(&b)
      parser = XMLParser.new

      Dir.glob(File.join(@savedir, '*')).each do |path|
        # Block form guarantees the file handle is closed after parsing
        # (the previous code leaked one handle per archive file).
        File.open(path) { |f| Ox.sax_parse(parser, f) }
        b.call(parser.papers)
      end
    end

    # Saves a timestamped OAI XML response to disk, appending
    # the resumption token to the filename if available
    def save_response(resp)
      content = resp.doc.to_s

      # Parse the response and extract some metadata
      doc = Nokogiri::XML(content)

      # responseDate for stamping files and potentially
      # initiating the next harvest
      responseDate = doc.css('responseDate').text

      # NOTE(review): the following assumes every response contains a
      # resumptionToken element carrying completeListSize/cursor attributes;
      # a response without one would raise NoMethodError here -- confirm
      # against actual arXiv "final" responses.
      # Total number of records in this harvest
      completeListSize = doc.css('resumptionToken').attr('completeListSize').value.to_i
      # How far we are in
      cursor = doc.css('resumptionToken').attr('cursor').value.to_i
      # How many records we gained in this response
      numRecords = doc.css('record').count

      # If we have a resumption_token, stick that on the filename.
      if resp.resumption_token && !resp.resumption_token.empty?
        suffix = resp.resumption_token
      else
        suffix = 'final'
      end

      # Write out the file and communicate progress. Block form closes the
      # handle even if the write raises.
      filename = "#{responseDate}_#{suffix}"
      File.open(File.join(@savedir, filename), 'w') { |f| f.write(content) }
      puts "Saved #{cursor + numRecords}/#{completeListSize} records to #{filename}"
    end
  end
end
|
data/lib/arxivsync.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'oai'
require 'nokogiri'
require 'ox'
require 'arxivsync/version'
require 'arxivsync/parser'
require 'arxivsync/downloader'
require 'arxivsync/xmlarchive'

module ArxivSync
  class << self
    # Parse a raw OAI XML string, returning an Array of Paper structs.
    def parse_xml(xml)
      parser = XMLParser.new
      Ox.sax_parse(parser, StringIO.new(xml))
      parser.papers
    end

    # Harvest without persisting: runs the OAI request cycle and yields
    # each raw response together with its parsed papers.
    def get_metadata(oai_params, &b)
      Downloader.new(oai_params).start do |resp|
        b.call(resp, parse_xml(resp.doc.to_s))
      end
    end
  end
end
|