arxivsync 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +1 -1
- data/lib/arxivsync/downloader.rb +6 -0
- data/lib/arxivsync/parser.rb +25 -2
- data/lib/arxivsync/version.rb +1 -1
- data/lib/arxivsync/xmlarchive.rb +1 -5
- data/test/parser.rb +11 -3
- metadata +2 -2
data/README.md
CHANGED
data/lib/arxivsync/downloader.rb
CHANGED
@@ -2,6 +2,12 @@ module ArxivSync
|
|
2
2
|
class Downloader
|
3
3
|
def initialize(initial_params={})
|
4
4
|
@initial_params = initial_params
|
5
|
+
|
6
|
+
if @initial_params[:from] == Date.today
|
7
|
+
puts "Last responseDate was today. arXiv lacks date granularity beyond the day level; please wait before continuing harvest."
|
8
|
+
return false
|
9
|
+
end
|
10
|
+
|
5
11
|
unless @initial_params[:resumptionToken]
|
6
12
|
@initial_params[:metadataPrefix] ||= 'arXiv'
|
7
13
|
end
|
data/lib/arxivsync/parser.rb
CHANGED
@@ -1,10 +1,14 @@
|
|
1
1
|
module ArxivSync
|
2
|
+
# Layout reference: http://www.xmlns.me/?op=visualize&id=643
|
2
3
|
Author = Struct.new(
|
3
|
-
:keyname, :forenames
|
4
|
+
:keyname, :forenames, :suffix, :affiliation
|
4
5
|
)
|
5
6
|
|
6
7
|
Paper = Struct.new(
|
7
|
-
:id, :created, :updated, :
|
8
|
+
:id, :created, :updated, :authors, :title,
|
9
|
+
:msc_class, :report_no, :journal_ref, :comments,
|
10
|
+
:abstract, :categories, :doi, :proxy, :license,
|
11
|
+
|
8
12
|
:primary_category, :crosslists
|
9
13
|
)
|
10
14
|
|
@@ -38,15 +42,34 @@ module ArxivSync
|
|
38
42
|
@model.updated = Date.parse(str)
|
39
43
|
when :title
|
40
44
|
@model.title = clean(str)
|
45
|
+
when :"msc-class"
|
46
|
+
@model.msc_class = str
|
47
|
+
when :"report-no"
|
48
|
+
@model.report_no = str
|
49
|
+
when :"journal-ref"
|
50
|
+
@model.journal_ref = str
|
51
|
+
when :comments
|
52
|
+
@model.comments = clean(str)
|
41
53
|
when :abstract
|
42
54
|
@model.abstract = clean(str)
|
43
55
|
when :categories
|
56
|
+
@model.categories = str.split
|
44
57
|
@model.primary_category = str.split[0]
|
45
58
|
@model.crosslists = str.split.drop(1)
|
59
|
+
when :doi
|
60
|
+
@model.doi = str
|
61
|
+
when :proxy
|
62
|
+
@model.proxy = str
|
63
|
+
when :license
|
64
|
+
@model.license = str
|
46
65
|
when :keyname
|
47
66
|
@author.keyname = str
|
48
67
|
when :forenames
|
49
68
|
@author.forenames = str
|
69
|
+
when :suffix
|
70
|
+
@author.suffix = str
|
71
|
+
when :affiliation
|
72
|
+
@author.affiliation = str
|
50
73
|
end
|
51
74
|
end
|
52
75
|
|
data/lib/arxivsync/version.rb
CHANGED
data/lib/arxivsync/xmlarchive.rb
CHANGED
@@ -41,12 +41,8 @@ module ArxivSync
|
|
41
41
|
|
42
42
|
if last_token.empty? # Previous sync completed successfully
|
43
43
|
responseDate = Date.parse(last_response.css('responseDate').text)
|
44
|
-
if responseDate == Date.today
|
45
|
-
puts "Last responseDate was today. arXiv lacks date granularity beyond the day level; please wait before continuing harvest."
|
46
|
-
return false
|
47
|
-
end
|
48
44
|
puts "Downloading from last responseDate: #{responseDate}"
|
49
|
-
oai_params[:from]
|
45
|
+
oai_params[:from] = responseDate
|
50
46
|
else # Previous sync aborted prematurely, resume
|
51
47
|
puts "Resuming download using previous resumptionToken: #{last_token}"
|
52
48
|
oai_params = { resumptionToken: last_token }
|
data/test/parser.rb
CHANGED
@@ -6,7 +6,7 @@ TEST_ROOT = File.dirname(__FILE__)
|
|
6
6
|
class TestParser < Minitest::Test
|
7
7
|
def test_parser
|
8
8
|
archive = ArxivSync::XMLArchive.new(File.join(TEST_ROOT, 'fixtures'))
|
9
|
-
|
9
|
+
tested = 0
|
10
10
|
archive.read_metadata do |papers|
|
11
11
|
assert_equal papers.count, 1000
|
12
12
|
papers.each do |paper|
|
@@ -14,16 +14,24 @@ class TestParser < Minitest::Test
|
|
14
14
|
assert_equal paper.created, Date.parse("2013-02-04")
|
15
15
|
assert_equal paper.updated, nil
|
16
16
|
assert_equal paper.title, "Correlation effects in the electronic structure of the Ni-based superconducting KNi2S2"
|
17
|
+
assert_equal paper.license, "http://creativecommons.org/licenses/by/3.0/"
|
17
18
|
assert_equal paper.primary_category, "cond-mat.supr-con"
|
18
19
|
assert_equal paper.crosslists, []
|
19
20
|
assert_includes paper.abstract, "using Gutzwiller approximation method."
|
20
21
|
assert_equal paper.authors.map(&:keyname), ["Lu", "Wang", "Xie", "Zhang"]
|
21
22
|
assert_equal paper.authors.map(&:forenames), ["Feng", "Wei-Hua", "Xinjian", "Fu-Chun"]
|
22
|
-
|
23
|
+
tested += 1
|
24
|
+
end
|
25
|
+
|
26
|
+
if paper.id == "1302.0758"
|
27
|
+
assert_equal paper.authors[0].affiliation, "Baylor University, Waco, TX, USA"
|
28
|
+
assert_equal paper.comments, "8 pages, 4 figures; presented by BFLW at ICHEP 2012"
|
29
|
+
assert_equal paper.report_no, "BU-HEPP-12-05"
|
30
|
+
tested += 1
|
23
31
|
end
|
24
32
|
end
|
25
33
|
end
|
26
34
|
|
27
|
-
|
35
|
+
assert_equal tested, 2
|
28
36
|
end
|
29
37
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arxivsync
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-06-
|
12
|
+
date: 2013-06-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|