arxivsync 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +1 -0
- data/README.md +14 -10
- data/Rakefile +5 -0
- data/arxivsync.gemspec +4 -2
- data/bin/arxivsync +28 -0
- data/lib/arxivsync/downloader.rb +6 -6
- data/lib/arxivsync/parser.rb +51 -41
- data/lib/arxivsync/version.rb +1 -1
- data/lib/arxivsync/xmlarchive.rb +19 -9
- data/lib/arxivsync.rb +1 -0
- data/test/fixtures/47001 +23613 -0
- data/test/parser_test.rb +55 -0
- data/test/test_helper.rb +3 -0
- metadata +58 -40
- data/test/fixtures/406001.xml +0 -23932
- data/test/parser.rb +0 -37
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 51a06a56a4190a2205f27edb93153fe48b4fd361
|
4
|
+
data.tar.gz: c6b1420f327fd406332b0af246fa905bd746aa5d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e6d87c40b4a3659930dfa316247a95d3a6c70c2ef6d98c839f158e07ff414499b409556507d547e67ef13a70078ca00bbd49b345a457bfaff0ff7b8c6d98594f
|
7
|
+
data.tar.gz: c5aaa89769f8b0efe19cdcfd18ac8e7a92d5e162f299614372840573f2505fb09e8872a7527434ca05a08db91e072414583b3794cacebadafa3cce0fc1c6c830
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# arxivsync
|
1
|
+
# arxivsync 0.0.6
|
2
2
|
|
3
3
|
Ruby OAI interface for harvesting the arXiv. Can be used to store and update an XML mirror of paper metadata, and parse the XML into Ruby objects to allow conversion into a friendlier format.
|
4
4
|
|
@@ -12,31 +12,35 @@ Ruby OAI interface for harvesting the arXiv. Can be used to store and update an
|
|
12
12
|
|
13
13
|
### Creating or updating an archive
|
14
14
|
|
15
|
-
|
16
|
-
|
17
|
-
|
15
|
+
Use the included shell command:
|
16
|
+
|
17
|
+
```bash
|
18
|
+
arxivsync ARCHIVE_DIR
|
18
19
|
```
|
19
20
|
|
20
|
-
|
21
|
+
This stores each XML response as an individual file, each containing up to 1000 records. Following an initial harvest, you can rerun this to add additional files containing all records since the last harvest.
|
21
22
|
|
22
23
|
Remember to leave at least a day between syncs: arXiv's date granularity doesn't go any finer than one day!
|
23
24
|
|
24
25
|
### Reading from an archive
|
25
26
|
|
26
|
-
```
|
27
|
+
```ruby
|
27
28
|
archive = ArxivSync::XMLArchive.new("/home/foo/savedir")
|
28
|
-
archive.read_metadata do |
|
29
|
-
#
|
29
|
+
archive.read_metadata do |papers|
|
30
|
+
# Papers come in blocks of at most 1000 at a time
|
31
|
+
papers.each do |paper|
|
32
|
+
# Do stuff with paper
|
33
|
+
end
|
30
34
|
end
|
31
35
|
```
|
32
36
|
|
33
|
-
Parses the XML files using
|
37
|
+
Parses the XML files using a SAX parser and yields Structs representing the metadata as it goes. The structures returned will closely match the [arXivRaw](http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXivRaw) format.
|
34
38
|
|
35
39
|
### Download and parse immediately
|
36
40
|
|
37
41
|
If you just want arxivsync to do the request-cycle and parsing bits but handle storage yourself:
|
38
42
|
|
39
|
-
```
|
43
|
+
```ruby
|
40
44
|
ArxivSync.get_metadata(oai_params) do |resp, papers|
|
41
45
|
papers.each do |paper|
|
42
46
|
# Do stuff with paper
|
data/Rakefile
CHANGED
data/arxivsync.gemspec
CHANGED
@@ -21,8 +21,10 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.add_development_dependency "bundler", "~> 1.3"
|
22
22
|
spec.add_development_dependency "rake"
|
23
23
|
spec.add_development_dependency "minitest"
|
24
|
+
spec.add_development_dependency "minitest-reporters"
|
24
25
|
|
25
26
|
spec.add_runtime_dependency "oai"
|
26
|
-
spec.add_runtime_dependency "
|
27
|
-
spec.add_runtime_dependency "ox", ">= 2.0.2"
|
27
|
+
spec.add_runtime_dependency "colorize"
|
28
|
+
spec.add_runtime_dependency "ox", ">= 2.0.2" # Super-fast XML parser
|
29
|
+
spec.add_runtime_dependency "nokogiri" # Slower but more accurate parser
|
28
30
|
end
|
data/bin/arxivsync
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'arxivsync'
|
4
|
+
require 'optparse'
|
5
|
+
|
6
|
+
def arxivsync
|
7
|
+
optparse = OptionParser.new do |opts|
|
8
|
+
opts.banner = %Q{Usage: arxivsync ARCHIVE_DIR
|
9
|
+
|
10
|
+
Begin or resume syncing a full copy of the arXiv
|
11
|
+
metadata to ARCHIVE_DIR. The directory will be created
|
12
|
+
if it does not exist.}
|
13
|
+
end
|
14
|
+
|
15
|
+
optparse.parse!
|
16
|
+
|
17
|
+
if ARGV.empty?
|
18
|
+
puts optparse
|
19
|
+
exit 1
|
20
|
+
end
|
21
|
+
|
22
|
+
archive = ArxivSync::XMLArchive.new(ARGV[0])
|
23
|
+
archive.sync
|
24
|
+
end
|
25
|
+
|
26
|
+
if __FILE__ == $0
|
27
|
+
arxivsync
|
28
|
+
end
|
data/lib/arxivsync/downloader.rb
CHANGED
@@ -4,12 +4,12 @@ module ArxivSync
|
|
4
4
|
@initial_params = initial_params
|
5
5
|
|
6
6
|
if @initial_params[:from] == Date.today
|
7
|
-
puts "Last responseDate was today. arXiv lacks date granularity beyond the day level; please wait before continuing harvest."
|
7
|
+
puts "Last responseDate was today. arXiv lacks date granularity beyond the day level; please wait before continuing harvest.".light_yellow
|
8
8
|
return false
|
9
9
|
end
|
10
10
|
|
11
11
|
unless @initial_params[:resumptionToken]
|
12
|
-
@initial_params[:metadataPrefix] ||= '
|
12
|
+
@initial_params[:metadataPrefix] ||= 'arXivRaw'
|
13
13
|
end
|
14
14
|
@last_params = nil
|
15
15
|
|
@@ -25,12 +25,12 @@ module ArxivSync
|
|
25
25
|
while true
|
26
26
|
if !resp.resumption_token || resp.resumption_token.empty?
|
27
27
|
if resp.doc.to_s.include?("Retry after 20 seconds") # Rate limitation
|
28
|
-
puts "Honoring 503 and sleeping for 20 seconds..."
|
28
|
+
puts "Honoring 503 and sleeping for 20 seconds...".light_yellow
|
29
29
|
sleep 20
|
30
30
|
resp = retry_request
|
31
31
|
else # No resumption_token and no retry should mean we're finished
|
32
32
|
b.call(resp)
|
33
|
-
puts "Finished archiving
|
33
|
+
puts "Finished archiving~!".bold.light_green
|
34
34
|
break
|
35
35
|
end
|
36
36
|
else # We have a resumption_token, keep going!
|
@@ -47,14 +47,14 @@ module ArxivSync
|
|
47
47
|
end
|
48
48
|
|
49
49
|
def make_request(params)
|
50
|
-
puts "Making OAI request with params: #{params.inspect}"
|
50
|
+
puts "Making OAI request with params: #{params.inspect}".light_magenta
|
51
51
|
|
52
52
|
@last_params = params.clone # list_records will nuke our params
|
53
53
|
|
54
54
|
begin
|
55
55
|
return @oai.list_records(params)
|
56
56
|
rescue Faraday::Error::TimeoutError
|
57
|
-
puts "Request timed out; retrying in 20 seconds"
|
57
|
+
puts "Request timed out; retrying in 20 seconds".light_yellow
|
58
58
|
sleep 20
|
59
59
|
return retry_request
|
60
60
|
end
|
data/lib/arxivsync/parser.rb
CHANGED
@@ -1,15 +1,27 @@
|
|
1
1
|
module ArxivSync
|
2
2
|
# Layout reference: http://www.xmlns.me/?op=visualize&id=643
|
3
|
-
|
4
|
-
:
|
3
|
+
Version = Struct.new(
|
4
|
+
:date, # Time.parse("Wed, 23 Jan 2008 21:06:41 GMT")
|
5
|
+
:size # '121kb'
|
5
6
|
)
|
6
7
|
|
7
8
|
Paper = Struct.new(
|
8
|
-
:id,
|
9
|
-
:
|
10
|
-
:
|
9
|
+
:id, # '0801.3673'
|
10
|
+
:submitter, # 'N. C. Bacalis'
|
11
|
+
:versions,
|
12
|
+
:title, # "Variational Functionals for Excited States"
|
13
|
+
:authors, # ['Naoum C. Bacalis']
|
14
|
+
:categories, # ['quant-ph'] (primary category first, then crosslists)
|
15
|
+
:abstract, # "Functionals that have local minima at the excited..."
|
11
16
|
|
12
|
-
|
17
|
+
# These properties do not always appear
|
18
|
+
:comments, # "4 pages"
|
19
|
+
:msc_class, # '57M25'
|
20
|
+
:report_no, # "LA-UR 07-7165"
|
21
|
+
:journal_ref, # "JHEP 0805:087,2008."
|
22
|
+
:doi, # "10.1088/1126-6708/2008/05/087"
|
23
|
+
:proxy, # "ccsd hal-00214270"
|
24
|
+
:license # "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"
|
13
25
|
)
|
14
26
|
|
15
27
|
class XMLParser < ::Ox::Sax
|
@@ -22,9 +34,9 @@ module ArxivSync
|
|
22
34
|
@papers = []
|
23
35
|
when :metadata
|
24
36
|
@model = Paper.new
|
25
|
-
@
|
26
|
-
when :
|
27
|
-
@
|
37
|
+
@versions = []
|
38
|
+
when :version
|
39
|
+
@version = Version.new
|
28
40
|
end
|
29
41
|
end
|
30
42
|
|
@@ -34,53 +46,51 @@ module ArxivSync
|
|
34
46
|
|
35
47
|
def text(str)
|
36
48
|
case @el
|
49
|
+
# Necessary elements
|
37
50
|
when :id
|
38
|
-
@model.id = str
|
39
|
-
when :
|
40
|
-
@model.
|
41
|
-
when :updated
|
42
|
-
@model.updated = Date.parse(str)
|
51
|
+
@model.id = clean(str)
|
52
|
+
when :submitter
|
53
|
+
@model.submitter = clean(str)
|
43
54
|
when :title
|
44
55
|
@model.title = clean(str)
|
56
|
+
when :authors
|
57
|
+
@model.authors = clean(str).split(/,| and /)
|
58
|
+
.map { |s| clean(s) }.reject { |s| s.empty? }
|
59
|
+
when :categories
|
60
|
+
@model.categories = clean(str).split(/\s/)
|
61
|
+
when :abstract
|
62
|
+
@model.abstract = clean(str)
|
63
|
+
|
64
|
+
# Optional elements
|
65
|
+
when :comments
|
66
|
+
@model.comments = clean(str)
|
45
67
|
when :"msc-class"
|
46
|
-
@model.msc_class = str
|
68
|
+
@model.msc_class = clean(str)
|
47
69
|
when :"report-no"
|
48
|
-
@model.report_no = str
|
70
|
+
@model.report_no = clean(str)
|
49
71
|
when :"journal-ref"
|
50
|
-
@model.journal_ref = str
|
51
|
-
when :comments
|
52
|
-
@model.comments = clean(str)
|
53
|
-
when :abstract
|
54
|
-
@model.abstract = clean(str)
|
55
|
-
when :categories
|
56
|
-
@model.categories = str.split
|
57
|
-
@model.primary_category = str.split[0]
|
58
|
-
@model.crosslists = str.split.drop(1)
|
72
|
+
@model.journal_ref = clean(str)
|
59
73
|
when :doi
|
60
|
-
@model.doi = str
|
74
|
+
@model.doi = clean(str)
|
61
75
|
when :proxy
|
62
|
-
@model.proxy = str
|
76
|
+
@model.proxy = clean(str)
|
63
77
|
when :license
|
64
|
-
@model.license = str
|
65
|
-
|
66
|
-
|
67
|
-
when :
|
68
|
-
@
|
69
|
-
when :
|
70
|
-
@
|
71
|
-
when :affiliation
|
72
|
-
@author.affiliation = str
|
78
|
+
@model.license = clean(str)
|
79
|
+
|
80
|
+
# Versions
|
81
|
+
when :date
|
82
|
+
@version.date = Time.parse(clean(str))
|
83
|
+
when :size
|
84
|
+
@version.size = clean(str)
|
73
85
|
end
|
74
86
|
end
|
75
87
|
|
76
88
|
def end_element(name)
|
77
89
|
case name
|
78
|
-
when :
|
79
|
-
@
|
90
|
+
when :version
|
91
|
+
@versions.push(@version)
|
80
92
|
when :metadata # End of a paper entry
|
81
|
-
|
82
|
-
#@paper.feed_id = Feed.get_or_create(@primary_category).id
|
83
|
-
@model.authors = @authors
|
93
|
+
@model.versions = @versions
|
84
94
|
|
85
95
|
@papers.push(@model)
|
86
96
|
end
|
data/lib/arxivsync/version.rb
CHANGED
data/lib/arxivsync/xmlarchive.rb
CHANGED
@@ -1,16 +1,25 @@
|
|
1
1
|
module ArxivSync
|
2
2
|
class XMLArchive
|
3
3
|
def initialize(savedir, custom_params=nil)
|
4
|
-
@savedir = savedir
|
5
|
-
|
6
|
-
|
4
|
+
@savedir = File.expand_path(savedir)
|
5
|
+
|
6
|
+
if not Dir.exists?(@savedir)
|
7
|
+
puts "Creating new XML archive at #{@savedir}\n".light_green
|
8
|
+
Dir.mkdir(@savedir)
|
7
9
|
end
|
8
10
|
end
|
9
11
|
|
10
12
|
# Parse the timestamp from the path to a previously saved
|
11
13
|
# arxiv xml block
|
12
14
|
def parse_dt(path)
|
13
|
-
|
15
|
+
begin
|
16
|
+
DateTime.parse(path.split('/')[-1].split('_')[0])
|
17
|
+
rescue ArgumentError
|
18
|
+
puts "Failed to parse timestamp from file #{path}\n".bold.light_red
|
19
|
+
puts ("Are you sure this is an archive directory?\n" +
|
20
|
+
"If so, it needs to be free of strange interloping files.").bold.light_white
|
21
|
+
exit 1
|
22
|
+
end
|
14
23
|
end
|
15
24
|
|
16
25
|
# Download from the arXiv!
|
@@ -31,9 +40,10 @@ module ArxivSync
|
|
31
40
|
end
|
32
41
|
|
33
42
|
if existing.empty?
|
34
|
-
puts "Commencing full arXiv download. This will take
|
43
|
+
puts ("Commencing full arXiv download. This will take ... a while.\n" +
|
35
44
|
"Download can be safely aborted at any point and will resume from\n" +
|
36
|
-
"last successful response."
|
45
|
+
"last successful response. However, resumptionTokens *will* expire\n" +
|
46
|
+
"if you leave it in an incomplete state for long enough.\n").bold.light_white
|
37
47
|
else
|
38
48
|
# Parse the most recent one
|
39
49
|
last_response = Nokogiri::XML(File.open(existing[-1]))
|
@@ -41,10 +51,10 @@ module ArxivSync
|
|
41
51
|
|
42
52
|
if last_token.empty? # Previous sync completed successfully
|
43
53
|
responseDate = Date.parse(last_response.css('responseDate').text)
|
44
|
-
puts "Downloading from last responseDate: #{responseDate}"
|
54
|
+
puts "Downloading from last responseDate: #{responseDate}\n".bold.light_green
|
45
55
|
oai_params[:from] = responseDate
|
46
56
|
else # Previous sync aborted prematurely, resume
|
47
|
-
puts "Resuming download using previous resumptionToken: #{last_token}"
|
57
|
+
puts "Resuming download using previous resumptionToken: #{last_token}\n".bold.light_green
|
48
58
|
oai_params = { resumptionToken: last_token }
|
49
59
|
end
|
50
60
|
end
|
@@ -97,7 +107,7 @@ module ArxivSync
|
|
97
107
|
f = File.open("#{@savedir}/#{filename}", 'w')
|
98
108
|
f.write(content)
|
99
109
|
f.close
|
100
|
-
puts "Saved #{cursor+numRecords}
|
110
|
+
puts "Saved #{cursor+numRecords} of #{completeListSize} records to #{filename}".light_green
|
101
111
|
end
|
102
112
|
end
|
103
113
|
end
|