arxivarius 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +18 -0
- data/lib/arxivarius/version.rb +1 -1
- data/lib/arxivarius/web_source.rb +148 -0
- data/lib/arxivarius.rb +30 -5
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fd93f41d3a1a1d7e703ac7637a158b91c99eceb592adbdfa3b95577a943ce096
|
|
4
|
+
data.tar.gz: '0399895405a2e0df92c2164dce3e047c0ada3ee97e02a7342207b642ccc8c759'
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 33b5fb64c8b9583bca078e5f1b612942346ce3b67334db53ee83aaf86944001f255b668e6a8c6ff897573ff13d5fd6f3b33f22cbff70e06f2c9de4ce06ce006c
|
|
7
|
+
data.tar.gz: 44729f9aef41d60f10bbb7e3f81235ac51f3cef4446d2d2bdcbb2e8d8537151b38baac0350b21c110fbde0eec2d8bca89472705f9e37ad1cd55c4e27f8cb8210
|
data/README.md
CHANGED
|
@@ -27,6 +27,24 @@ Pass any arXiv ID to `Arxivarius.get`:
|
|
|
27
27
|
paper = Arxivarius.get('2601.00470')
|
|
28
28
|
```
|
|
29
29
|
|
|
30
|
+
### Data sources
|
|
31
|
+
|
|
32
|
+
By default the gem queries the arXiv Atom API (`export.arxiv.org`). That host
|
|
33
|
+
has been unstable lately and sometimes responds with `429 Rate exceeded`
|
|
34
|
+
(`Arxivarius::Error::ApiError`). When it does, you can fall back to scraping the
|
|
35
|
+
public abstract page (`arxiv.org/abs/...`) instead:
|
|
36
|
+
|
|
37
|
+
```ruby
|
|
38
|
+
Arxivarius.get('2601.00470') # arXiv API (default)
|
|
39
|
+
Arxivarius.get('2601.00470', source: :api) # arXiv API, explicitly
|
|
40
|
+
Arxivarius.get('2601.00470', source: :web) # scrape the abstract page
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Both sources return the same `Paper` object, so the rest of the API below works
|
|
44
|
+
unchanged. The web source recovers every field the API does **except author
|
|
45
|
+
affiliations**, which the abstract page does not list (`author.affiliations`
|
|
46
|
+
is `[]`).
|
|
47
|
+
|
|
30
48
|
All common ID formats work:
|
|
31
49
|
|
|
32
50
|
```ruby
|
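data/lib/arxivarius/version.rb
CHANGED
|
[the version.rb hunk (+1 -1) is missing from this capture]
data/lib/arxivarius/web_source.rb
ADDED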
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Arxivarius
|
|
4
|
+
# Builds a Paper by scraping the public arXiv abstract page
|
|
5
|
+
# (https://arxiv.org/abs/<id>), used as an alternative to the Atom API when
|
|
6
|
+
# it is rate limited. Reads the Highwire `citation_*` <meta> tags plus a few
|
|
7
|
+
# body elements. Every field the API exposes is recovered except author
|
|
8
|
+
# affiliations, which are not present on the abstract page.
|
|
9
|
+
module WebSource
|
|
10
|
+
ABS_URL = 'https://arxiv.org/abs/'
|
|
11
|
+
PDF_URL = 'https://arxiv.org/pdf/'
|
|
12
|
+
USER_AGENT = "arxivarius/#{Arxivarius::VERSION} " \
|
|
13
|
+
'(+https://github.com/antlypls/arxivarius)'.freeze
|
|
14
|
+
|
|
15
|
+
class << self
|
|
16
|
+
def fetch(id)
|
|
17
|
+
doc = ::Nokogiri::HTML(fetch_html(id))
|
|
18
|
+
|
|
19
|
+
# No citation_title means the page is not an abstract page (e.g. an
|
|
20
|
+
# arXiv "identifier not recognized" page served with a 200 status).
|
|
21
|
+
return nil unless meta(doc, 'citation_title')
|
|
22
|
+
|
|
23
|
+
build_paper(doc)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
private
|
|
27
|
+
|
|
28
|
+
def fetch_html(id)
|
|
29
|
+
url = URI("#{ABS_URL}#{id}")
|
|
30
|
+
response = Net::HTTP.get_response(url, 'User-Agent' => USER_AGENT)
|
|
31
|
+
|
|
32
|
+
return nil if response.is_a?(Net::HTTPNotFound)
|
|
33
|
+
|
|
34
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
35
|
+
message = "ArXiv returned #{response.code}: #{response.body&.strip}"
|
|
36
|
+
raise Arxivarius::Error::ApiError, message
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
response.body
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def build_paper(doc)
|
|
43
|
+
Arxivarius::Paper.new.tap do |paper|
|
|
44
|
+
apply_metadata(paper, doc)
|
|
45
|
+
paper.created_at, paper.updated_at = submission_dates(doc)
|
|
46
|
+
apply_associations(paper, doc)
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def apply_metadata(paper, doc)
|
|
51
|
+
paper.arxiv_url = meta_property(doc, 'og:url')
|
|
52
|
+
paper.title = squish(meta(doc, 'citation_title'))
|
|
53
|
+
paper.summary = squish(meta(doc, 'citation_abstract'))
|
|
54
|
+
paper.comment = comment(doc)
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def apply_associations(paper, doc)
|
|
58
|
+
paper.authors = authors(doc)
|
|
59
|
+
paper.categories = categories(doc)
|
|
60
|
+
paper.primary_category = primary_category(doc)
|
|
61
|
+
paper.links = links(paper.arxiv_url)
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# arXiv lists authors as "Last, First" in citation_author; reorder to
|
|
65
|
+
# "First Last" so they match the Atom API output exactly.
|
|
66
|
+
def authors(doc)
|
|
67
|
+
meta_all(doc, 'citation_author').map do |raw|
|
|
68
|
+
last, first = raw.split(',', 2).map { |part| squish(part) }
|
|
69
|
+
name = first ? "#{first} #{last}" : last
|
|
70
|
+
# Affiliations are not on the abstract page; match the API's empty
|
|
71
|
+
# list rather than leaving them nil.
|
|
72
|
+
Arxivarius::Author.new.tap do |author|
|
|
73
|
+
author.name = name
|
|
74
|
+
author.affiliations = []
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
def categories(doc)
|
|
80
|
+
subjects = doc.at_css('td.subjects')&.text.to_s
|
|
81
|
+
subjects.scan(/\(([^()]+)\)/).flatten.map do |code|
|
|
82
|
+
build_category(code)
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def primary_category(doc)
|
|
87
|
+
text = doc.at_css('.primary-subject')&.text.to_s
|
|
88
|
+
code = text[/\(([^()]+)\)/, 1]
|
|
89
|
+
build_category(code) if code
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
# Synthesize the link set the abstract page does not expose structurally,
|
|
93
|
+
# so pdf_url, content_types and available_in_pdf? keep working. The PDF
|
|
94
|
+
# link is versioned to match the Atom API (e.g. .../pdf/2601.00470v1).
|
|
95
|
+
def links(arxiv_url)
|
|
96
|
+
versioned_id = arxiv_url.split('/abs/', 2).last
|
|
97
|
+
[
|
|
98
|
+
build_link("#{PDF_URL}#{versioned_id}", 'application/pdf'),
|
|
99
|
+
build_link(arxiv_url, 'text/html')
|
|
100
|
+
]
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
# The submission history lists every version's timestamp as
|
|
104
|
+
# "[v1] Thu, 1 Jan 2026 20:56:05 UTC". The first is when the paper was
|
|
105
|
+
# published, the last is when it was last revised.
|
|
106
|
+
def submission_dates(doc)
|
|
107
|
+
history = doc.at_css('.submission-history')&.text.to_s
|
|
108
|
+
stamps = history.scan(/\[v\d+\]\s*(.+? UTC)/).flatten
|
|
109
|
+
|
|
110
|
+
return [nil, nil] if stamps.empty?
|
|
111
|
+
|
|
112
|
+
[Time.parse(stamps.first), Time.parse(stamps.last)]
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
def comment(doc)
|
|
116
|
+
text = doc.at_css('td.comments')&.text
|
|
117
|
+
squish(text) if text
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
def build_category(code)
|
|
121
|
+
Arxivarius::Category.new.tap { |category| category.name = code }
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
def build_link(url, content_type)
|
|
125
|
+
Arxivarius::Link.new.tap do |link|
|
|
126
|
+
link.url = url
|
|
127
|
+
link.content_type = content_type
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
def meta(doc, name)
|
|
132
|
+
doc.at_css("meta[name='#{name}']")&.[]('content')
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
def meta_all(doc, name)
|
|
136
|
+
doc.css("meta[name='#{name}']").map { |tag| tag['content'] }
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
def meta_property(doc, property)
|
|
140
|
+
doc.at_css("meta[property='#{property}']")&.[]('content')
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
def squish(string)
|
|
144
|
+
Arxivarius::Text.squish(string)
|
|
145
|
+
end
|
|
146
|
+
end
|
|
147
|
+
end
|
|
148
|
+
end
|
data/lib/arxivarius.rb
CHANGED
|
@@ -14,11 +14,13 @@ require 'arxivarius/author'
|
|
|
14
14
|
require 'arxivarius/link'
|
|
15
15
|
require 'arxivarius/category'
|
|
16
16
|
require 'arxivarius/paper'
|
|
17
|
+
require 'arxivarius/web_source'
|
|
17
18
|
|
|
18
19
|
module Arxivarius
|
|
19
20
|
module Error
|
|
20
21
|
class PaperNotFound < StandardError; end
|
|
21
22
|
class MalformedId < StandardError; end
|
|
23
|
+
class ApiError < StandardError; end
|
|
22
24
|
end
|
|
23
25
|
|
|
24
26
|
# ArXiv uses two ID formats:
|
|
@@ -31,18 +33,16 @@ module Arxivarius
|
|
|
31
33
|
ID_FORMAT = /^#{CURRENT_URL_FORMAT}/
|
|
32
34
|
|
|
33
35
|
class << self
|
|
34
|
-
def get(identifier)
|
|
36
|
+
def get(identifier, source: :api)
|
|
35
37
|
id = parse_arxiv_identifier(identifier)
|
|
36
38
|
|
|
37
39
|
raise Arxivarius::Error::MalformedId, 'Paper ID format is invalid' unless valid_id?(id)
|
|
38
40
|
|
|
39
41
|
id = normalize_legacy_id(id)
|
|
40
42
|
|
|
41
|
-
|
|
42
|
-
response = ::Nokogiri::XML(Net::HTTP.get(url)).remove_namespaces!
|
|
43
|
-
paper = Arxivarius::Paper.parse(response.to_s, single: true)
|
|
43
|
+
paper = fetch_paper(id, source)
|
|
44
44
|
|
|
45
|
-
# Paper is nil when the
|
|
45
|
+
# Paper is nil when the source returns no entry for the given ID.
|
|
46
46
|
raise Arxivarius::Error::PaperNotFound, "Paper #{id} doesn't exist on arXiv" unless paper&.title
|
|
47
47
|
|
|
48
48
|
paper
|
|
@@ -50,6 +50,19 @@ module Arxivarius
|
|
|
50
50
|
|
|
51
51
|
private
|
|
52
52
|
|
|
53
|
+
def fetch_paper(id, source)
|
|
54
|
+
case source
|
|
55
|
+
when :api then fetch_via_api(id)
|
|
56
|
+
when :web then WebSource.fetch(id)
|
|
57
|
+
else raise ArgumentError, "Unknown source: #{source.inspect}"
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def fetch_via_api(id)
|
|
62
|
+
response = ::Nokogiri::XML(fetch_xml(id)).remove_namespaces!
|
|
63
|
+
Arxivarius::Paper.parse(response.to_s, single: true)
|
|
64
|
+
end
|
|
65
|
+
|
|
53
66
|
def parse_arxiv_identifier(identifier)
|
|
54
67
|
if valid_url?(identifier)
|
|
55
68
|
format = legacy_url?(identifier) ? LEGACY_URL_FORMAT : CURRENT_URL_FORMAT
|
|
@@ -71,6 +84,18 @@ module Arxivarius
|
|
|
71
84
|
identifier.match?(LEGACY_URL_FORMAT)
|
|
72
85
|
end
|
|
73
86
|
|
|
87
|
+
def fetch_xml(id)
|
|
88
|
+
url = URI("https://export.arxiv.org/api/query?id_list=#{id}")
|
|
89
|
+
response = Net::HTTP.get_response(url)
|
|
90
|
+
|
|
91
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
92
|
+
message = "ArXiv API returned #{response.code}: #{response.body&.strip}"
|
|
93
|
+
raise Arxivarius::Error::ApiError, message
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
response.body
|
|
97
|
+
end
|
|
98
|
+
|
|
74
99
|
# The arXiv API no longer resolves subcategory legacy IDs.
|
|
75
100
|
# Strips the subcategory: math.DG/0510097 -> math/0510097.
|
|
76
101
|
def normalize_legacy_id(id)
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: arxivarius
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.10.0
|
|
4
|
+
version: 0.12.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- antlypls
|
|
@@ -55,6 +55,7 @@ files:
|
|
|
55
55
|
- lib/arxivarius/paper.rb
|
|
56
56
|
- lib/arxivarius/text.rb
|
|
57
57
|
- lib/arxivarius/version.rb
|
|
58
|
+
- lib/arxivarius/web_source.rb
|
|
58
59
|
homepage: https://github.com/antlypls/arxivarius
|
|
59
60
|
licenses:
|
|
60
61
|
- MIT
|
|
@@ -73,7 +74,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
73
74
|
- !ruby/object:Gem::Version
|
|
74
75
|
version: '0'
|
|
75
76
|
requirements: []
|
|
76
|
-
rubygems_version: 4.0.
|
|
77
|
+
rubygems_version: 4.0.8
|
|
77
78
|
specification_version: 4
|
|
78
79
|
summary: Fetch and parse papers metadata from arXiv
|
|
79
80
|
test_files: []
|