arxivsync 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Resolve gems from the public RubyGems index.
source "https://rubygems.org"

# All dependencies are declared in arxivsync.gemspec.
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Jaiden Mispy
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,53 @@
1
+ # arxivsync-0.0.3
2
+
3
+ Ruby OAI interface for harvesting the arXiv. Can be used to store and update an XML mirror of paper metadata, and parse the XML into Ruby objects to allow conversion into a friendlier format.
4
+
5
+ ## Installation
6
+
7
+ ```
8
+ gem install arxivsync
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ### Creating or updating an archive
14
+
15
+ ```
16
+ archive = ArxivSync::XMLArchive.new("/home/foo/savedir")
17
+ archive.sync
18
+ ```
19
+
20
+ Stores each XML response as an individual file, each containing up to 1000 records. Following an initial harvest, you can rerun this to add additional files containing all records since the last harvest.
21
+
22
+ Remember to leave at least a day between syncs: arXiv's OAI interface only offers day-level date granularity, so `sync` will not start a new harvest on the same day as the previous one.
23
+
24
+ ### Reading from an archive
25
+
26
+ ```
27
+ archive = ArxivSync::XMLArchive.new("/home/foo/savedir")
28
+ archive.read_metadata do |papers|
29
+ # papers is an Array of Paper structs (one batch per saved XML file)
30
+ end
31
+ ```
32
+
33
+ Parses each XML file using Ox's SAX parser and yields an array of Structs representing the paper metadata, one array per file.
34
+
35
+ ### Download and parse immediately
36
+
37
+ If you just want arxivsync to do the request-cycle and parsing bits but handle storage yourself:
38
+
39
+ ```
40
+ ArxivSync.get_metadata(oai_params) do |resp, papers|
41
+ papers.each do |paper|
42
+ # Do stuff with paper
43
+ end
44
+ end
45
+ ```
46
+
47
+ ## Contributing
48
+
49
+ 1. Fork it
50
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
51
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
52
+ 4. Push to the branch (`git push origin my-new-feature`)
53
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/arxivsync.gemspec ADDED
@@ -0,0 +1,28 @@
1
# coding: utf-8
# Gem specification for arxivsync: packaging metadata and dependencies.

# Put lib/ on the load path so the version constant can be read straight
# from the source tree.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'arxivsync/version'

Gem::Specification.new do |spec|
  spec.name          = "arxivsync"
  spec.version       = ArxivSync::VERSION
  spec.authors       = ["Jaiden Mispy"]
  spec.email         = ["scirate@mispy.me"]
  spec.description   = %q{OAI interface for harvesting the arXiv database}
  spec.summary       = %q{OAI interface for harvesting the arXiv database}
  spec.homepage      = ""
  spec.license       = "MIT"

  # NUL-delimited listing: safe for file names that contain newlines or
  # other characters git would otherwise quote.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "minitest"

  spec.add_runtime_dependency "oai"
  # ArxivSync::Downloader builds its OAI::Client with :parser => 'libxml';
  # the oai gem loads that parser from libxml-ruby.
  spec.add_runtime_dependency "libxml-ruby"
  spec.add_runtime_dependency "nokogiri"
  spec.add_runtime_dependency "ox", ">= 2.0.2"
end
@@ -0,0 +1,57 @@
1
module ArxivSync
  # Runs an OAI ListRecords harvest against the arXiv export endpoint,
  # following resumption tokens until the server stops sending them.
  class Downloader
    # @param initial_params [Hash] OAI request parameters for the first
    #   request. Unless a :resumptionToken is supplied, :metadataPrefix
    #   defaults to 'arXiv'.
    def initialize(initial_params={})
      # Work on a copy so that filling in the default below never modifies
      # the hash the caller handed us.
      @initial_params = (initial_params || {}).dup
      unless @initial_params[:resumptionToken]
        @initial_params[:metadataPrefix] ||= 'arXiv'
      end
      @last_params = nil

      @oai = OAI::Client.new('http://export.arxiv.org/oai2', :parser => 'libxml')
    end

    # Performs the harvest, passing every successful response to the block.
    #
    # @yieldparam resp the response object returned by the OAI client
    # @return [Downloader] self
    def start(&b)
      # Make the initial request
      resp = make_request(@initial_params)

      # Continue to make requests until the server stops sending
      # resumption tokens
      loop do
        token = resp.resumption_token

        if token && !token.empty?
          # We have a resumption token, keep going!
          b.call(resp)
          resp = make_request(resumptionToken: token)
        elsif resp.doc.to_s.include?("Retry after 20 seconds")
          # Rate limitation: wait, then repeat the request we just made
          puts "Honoring 503 and sleeping for 20 seconds..."
          sleep 20
          resp = retry_request
        else
          # No resumption token and no retry should mean we're finished
          b.call(resp)
          puts "Finished archiving!"
          break
        end
      end

      self
    end

    # Repeats the most recent request with the same parameters.
    def retry_request
      make_request(@last_params)
    end

    # Issues a ListRecords request, retrying every 20 seconds for as long as
    # the request keeps timing out.
    #
    # @param params [Hash] OAI request parameters
    # @return the response object returned by the OAI client
    def make_request(params)
      @last_params = params.clone

      # Retry in a loop rather than by recursion so that a long run of
      # timeouts cannot grow the call stack.
      loop do
        puts "Making OAI request with params: #{@last_params.inspect}"

        begin
          # list_records will nuke the hash it is given, so hand it a
          # throwaway copy and keep @last_params intact for retries.
          return @oai.list_records(@last_params.clone)
        rescue Faraday::Error::TimeoutError
          puts "Request timed out; retrying in 20 seconds"
          sleep 20
        end
      end
    end
  end
end
@@ -0,0 +1,67 @@
1
module ArxivSync
  # An author entry of a paper.
  Author = Struct.new(
    :keyname, :forenames
  )

  # Metadata for one paper. `created` and `updated` are Date objects,
  # `authors` is an Array of Author, `crosslists` an Array of Strings.
  Paper = Struct.new(
    :id, :created, :updated, :title, :abstract, :authors,
    :primary_category, :crosslists
  )

  # Ox SAX handler that turns the <metadata> entries of an OAI response
  # into Paper structs, collected in #papers.
  class XMLParser < ::Ox::Sax
    # Papers parsed so far; reset whenever a <ListRecords> element opens.
    attr_accessor :papers

    def initialize
      super()
      # Start with an empty list so #papers is never nil, and so that
      # <metadata> entries outside a <ListRecords> element can be collected
      # instead of raising on a nil list.
      @papers = []
    end

    # Tracks the current element and starts a fresh container for it.
    def start_element(name, attributes=[])
      @el = name
      case name
      when :ListRecords
        @papers = []
      when :metadata
        @model = Paper.new
        @authors = []
      when :author
        @author = Author.new
      end
    end

    # Collapses runs of whitespace into single spaces and trims the ends.
    def clean(str)
      str.gsub(/\s+/, ' ').strip
    end

    # Stores the text of the current element on the paper or author
    # being built. Text of any other element is ignored.
    def text(str)
      case @el
      when :id
        @model.id = str
      when :created
        @model.created = Date.parse(str)
      when :updated
        @model.updated = Date.parse(str)
      when :title
        @model.title = clean(str)
      when :abstract
        @model.abstract = clean(str)
      when :categories
        # Whitespace-separated list: first entry is the primary category,
        # the rest are cross-lists.
        primary, *crosslists = str.split
        @model.primary_category = primary
        @model.crosslists = crosslists
      when :keyname
        @author.keyname = str
      when :forenames
        @author.forenames = str
      end
    end

    # Finalizes authors and papers as their elements close.
    def end_element(name)
      case name
      when :author
        @authors.push(@author)
      when :metadata # End of a paper entry
        @model.authors = @authors
        @papers.push(@model)
      end
      # Text that follows a closing tag belongs to no tracked element
      @el = nil
    end
  end
end
@@ -0,0 +1,3 @@
1
module ArxivSync
  # Release version of the gem; frozen so it cannot be modified in place.
  VERSION = "0.0.3".freeze
end
@@ -0,0 +1,107 @@
1
module ArxivSync
  # A directory of saved OAI XML responses, one file per response, named
  # "<responseDate>_<resumption token or 'final'>".
  class XMLArchive
    # @param savedir [String] directory for the XML files; created if missing
    # @param custom_params unused; kept so existing callers keep working
    def initialize(savedir, custom_params=nil)
      @savedir = savedir
      begin
        Dir.mkdir(@savedir) # Ensure this directory exists
      rescue Errno::EEXIST
        # Already present, nothing to do
      end
    end

    # Parse the timestamp from the path to a previously saved
    # arxiv xml block
    #
    # @param path [String] path whose file name starts with "<timestamp>_"
    # @return [DateTime]
    def parse_dt(path)
      DateTime.parse(File.basename(path).split('_').first)
    end

    # Download from the arXiv!
    # This can be called in three potential states:
    # - The savedir has yet to be populated with any xml, meaning we need to
    #   start a full mirror of the entire database.
    #
    # - The most recent xml file contains a resumptionToken, meaning the last
    #   harvest attempt was aborted prematurely and we need to resume.
    #
    # - The most recent xml file does not have a resumptionToken, in which case
    #   we begin a new harvest for everything since the responseDate of the last.
    #
    # @param oai_params [Hash] extra OAI request parameters (not modified)
    # @return [false, Downloader] false when the last completed harvest
    #   carries today's responseDate, otherwise the result of Downloader#start
    def sync(oai_params={})
      oai_params = oai_params.dup # never modify the caller's hash
      latest = archive_files.last

      if latest.nil?
        puts "Commencing full arXiv download. This will take quite a while.\n" +
             "Download can be safely aborted at any point and will resume from\n" +
             "last successful response."
      else
        # Parse the most recent one; the block form closes the file handle
        last_response = File.open(latest) { |f| Nokogiri::XML(f) }
        last_token = last_response.css('resumptionToken').text

        if last_token.empty? # Previous sync completed successfully
          response_date = Date.parse(last_response.css('responseDate').text)
          if response_date == Date.today
            puts "Last responseDate was today. arXiv lacks date granularity beyond the day level; please wait before continuing harvest."
            return false
          end
          puts "Downloading from last responseDate: #{response_date}"
          oai_params[:from] ||= response_date
        else # Previous sync aborted prematurely, resume
          puts "Resuming download using previous resumptionToken: #{last_token}"
          oai_params = { resumptionToken: last_token }
        end
      end

      Downloader.new(oai_params).start { |resp| save_response(resp) }
    end

    # Parses every saved file with the Ox SAX parser, oldest first.
    # Yields one Array of Paper objects per file (empty when a file
    # contains no records).
    def read_metadata(&b)
      archive_files.each do |path|
        # Fresh parser per file so results from one file can never leak
        # into the next.
        parser = XMLParser.new
        File.open(path) { |f| Ox.sax_parse(parser, f) }
        b.call(parser.papers || [])
      end
    end

    # Saves a timestamped OAI XML response to disk, appending
    # the resumption token to the filename if available
    #
    # @param resp response object exposing #doc and #resumption_token
    def save_response(resp)
      content = resp.doc.to_s

      # Parse the response and extract some metadata
      doc = Nokogiri::XML(content)

      # responseDate for stamping files and potentially
      # initiating the next harvest
      response_date = doc.css('responseDate').text

      # How many records we gained in this response
      num_records = doc.css('record').count

      # A harvest that fits in a single response carries no resumptionToken
      # element at all, and its attributes are optional when it is present,
      # so fall back to what this response alone tells us.
      token_node = doc.at_css('resumptionToken')
      # How far we are in
      cursor = token_node ? token_node['cursor'].to_i : 0
      # Total number of records in this harvest
      complete_list_size =
        if token_node && token_node['completeListSize']
          token_node['completeListSize'].to_i
        else
          cursor + num_records
        end

      # If we have a resumption_token, stick that on the filename.
      if resp.resumption_token && !resp.resumption_token.empty?
        suffix = resp.resumption_token
      else
        suffix = 'final'
      end

      # Both parts come from the server; keep the name inside savedir.
      filename = "#{response_date}_#{suffix}".tr('/', '_')

      # Write out the file and communicate progress
      File.open(File.join(@savedir, filename), 'w') { |f| f.write(content) }
      puts "Saved #{cursor + num_records}/#{complete_list_size} records to #{filename}"
    end

    private

    # Paths of all saved responses, sorted by the timestamp in the filename.
    def archive_files
      Dir.glob(File.join(@savedir, '*')).sort_by { |path| parse_dt(path) }
    end
  end
end
data/lib/arxivsync.rb ADDED
@@ -0,0 +1,24 @@
1
+ require 'oai'
2
+ require 'nokogiri'
3
+ require 'ox'
4
+ require 'arxivsync/version'
5
+ require 'arxivsync/parser'
6
+ require 'arxivsync/downloader'
7
+ require 'arxivsync/xmlarchive'
8
+
9
module ArxivSync
  class << self
    # Parses an OAI XML response held in memory.
    #
    # @param xml [String] the response document
    # @return [Array<Paper>] the papers found; empty (never nil) when the
    #   document contains no records
    def parse_xml(xml)
      parser = XMLParser.new
      Ox.sax_parse(parser, StringIO.new(xml))
      parser.papers || []
    end

    # Runs a harvest and parses each response as it arrives, leaving
    # storage to the caller.
    #
    # @param oai_params [Hash] OAI request parameters for the first request
    # @yieldparam resp the raw response object from the downloader
    # @yieldparam papers [Array<Paper>] papers parsed from that response
    def get_metadata(oai_params, &b)
      Downloader.new(oai_params).start do |resp|
        b.call(resp, parse_xml(resp.doc.to_s))
      end
    end
  end
end