arxivarius 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ae569c1030d082bb94e10a63472ef34aeb89bb9bef56db25c57f68254dc00be8
4
- data.tar.gz: f3f664f4a8feb5b7a585166187590018ae68f7e98cd5ae290a85259baab40222
3
+ metadata.gz: fd93f41d3a1a1d7e703ac7637a158b91c99eceb592adbdfa3b95577a943ce096
4
+ data.tar.gz: '0399895405a2e0df92c2164dce3e047c0ada3ee97e02a7342207b642ccc8c759'
5
5
  SHA512:
6
- metadata.gz: f218c5e45f1f1305ab9437a205f9c35db5494bc5491af3d6d19c560e025ba452990caff7fb709fcde5e07302fe73ad8ea9e2a6f043831873cdd076c2fd8a8ccf
7
- data.tar.gz: 95396dc42e302e54b4e6ae42475bd375ee77d8f49b04ee6383e57dfb0a8d144c2cc1c59cd135ea57c043fa1be8c9fc575c8cf09240dea03179e939097415ca7b
6
+ metadata.gz: 33b5fb64c8b9583bca078e5f1b612942346ce3b67334db53ee83aaf86944001f255b668e6a8c6ff897573ff13d5fd6f3b33f22cbff70e06f2c9de4ce06ce006c
7
+ data.tar.gz: 44729f9aef41d60f10bbb7e3f81235ac51f3cef4446d2d2bdcbb2e8d8537151b38baac0350b21c110fbde0eec2d8bca89472705f9e37ad1cd55c4e27f8cb8210
data/README.md CHANGED
@@ -27,6 +27,24 @@ Pass any arXiv ID to `Arxivarius.get`:
27
27
  paper = Arxivarius.get('2601.00470')
28
28
  ```
29
29
 
30
+ ### Data sources
31
+
32
+ By default the gem queries the arXiv Atom API (`export.arxiv.org`). That host
33
+ has been unstable lately and sometimes responds with `429 Rate exceeded`
34
+ (`Arxivarius::Error::ApiError`). When it does, you can fall back to scraping the
35
+ public abstract page (`arxiv.org/abs/...`) instead:
36
+
37
+ ```ruby
38
+ Arxivarius.get('2601.00470') # arXiv API (default)
39
+ Arxivarius.get('2601.00470', source: :api) # arXiv API, explicitly
40
+ Arxivarius.get('2601.00470', source: :web) # scrape the abstract page
41
+ ```
42
+
43
+ Both sources return the same `Paper` object, so the rest of the API below works
44
+ unchanged. The web source recovers every field the API does **except author
45
+ affiliations**, which the abstract page does not list (`author.affiliations`
46
+ is `[]`).
47
+
30
48
  All common ID formats work:
31
49
 
32
50
  ```ruby
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Arxivarius
4
- VERSION = '0.10.0'
4
+ VERSION = '0.12.0'
5
5
  end
@@ -0,0 +1,148 @@
1
+ # frozen_string_literal: true
2
+
3
module Arxivarius
  # Builds a Paper by scraping the public arXiv abstract page
  # (https://arxiv.org/abs/<id>), used as an alternative to the Atom API when
  # it is rate limited. Reads the Highwire `citation_*` <meta> tags plus a few
  # body elements. Every field the API exposes is recovered except author
  # affiliations, which are not present on the abstract page.
  #
  # NOTE(review): this file has no requires of its own; it relies on Nokogiri,
  # Net::HTTP, URI and the `time` stdlib extension (for Time.parse) having been
  # loaded by the gem's entry point -- confirm.
  module WebSource
    ABS_URL = 'https://arxiv.org/abs/'
    PDF_URL = 'https://arxiv.org/pdf/'
    USER_AGENT = "arxivarius/#{Arxivarius::VERSION} " \
                 '(+https://github.com/antlypls/arxivarius)'.freeze

    class << self
      # Downloads the abstract page for +id+ and turns it into a Paper.
      #
      # Returns nil when the page does not exist (HTTP 404) or when it has no
      # citation_title meta tag. Raises Arxivarius::Error::ApiError for any
      # other non-success HTTP response.
      def fetch(id)
        html = fetch_html(id)
        # A 404 produces no body; stop here rather than parsing nil.
        return nil unless html

        doc = ::Nokogiri::HTML(html)

        # No citation_title means the page is not an abstract page (e.g. an
        # arXiv "identifier not recognized" page served with a 200 status).
        return nil unless meta(doc, 'citation_title')

        build_paper(doc, id)
      end

      private

      # Performs the GET request for the abstract page. Returns the response
      # body, nil on 404, and raises ApiError on every other non-2xx status.
      def fetch_html(id)
        url = URI("#{ABS_URL}#{id}")
        response = Net::HTTP.get_response(url, 'User-Agent' => USER_AGENT)

        return nil if response.is_a?(Net::HTTPNotFound)

        unless response.is_a?(Net::HTTPSuccess)
          message = "ArXiv returned #{response.code}: #{response.body&.strip}"
          raise Arxivarius::Error::ApiError, message
        end

        response.body
      end

      # Assembles a Paper from the parsed page. +id+ is only used as a
      # fallback when the page does not advertise its own URL.
      def build_paper(doc, id)
        Arxivarius::Paper.new.tap do |paper|
          apply_metadata(paper, doc, id)
          paper.created_at, paper.updated_at = submission_dates(doc)
          apply_associations(paper, doc)
        end
      end

      # Fills the scalar fields: URL, title, abstract and comment.
      def apply_metadata(paper, doc, id)
        abstract = meta(doc, 'citation_abstract')

        # Prefer the URL the page advertises; if the og:url tag is missing,
        # fall back to the URL that was just fetched so that #links always
        # receives a String instead of nil.
        paper.arxiv_url = meta_property(doc, 'og:url') || "#{ABS_URL}#{id}"
        paper.title = squish(meta(doc, 'citation_title'))
        # Only squish an abstract that is actually present, mirroring #comment.
        paper.summary = squish(abstract) if abstract
        paper.comment = comment(doc)
      end

      # Fills the collection fields: authors, categories and links.
      def apply_associations(paper, doc)
        paper.authors = authors(doc)
        paper.categories = categories(doc)
        paper.primary_category = primary_category(doc)
        paper.links = links(paper.arxiv_url)
      end

      # Builds one Author per citation_author meta tag.
      def authors(doc)
        meta_all(doc, 'citation_author').map do |raw|
          # Affiliations are not on the abstract page; match the API's empty
          # list rather than leaving them nil.
          Arxivarius::Author.new.tap do |author|
            author.name = display_name(raw)
            author.affiliations = []
          end
        end
      end

      # arXiv lists authors as "Last, First" in citation_author; reorder to
      # "First Last" so they match the Atom API output exactly. A value with
      # no comma is returned as a single squished name.
      def display_name(raw)
        last, first = raw.split(',', 2).map { |part| squish(part) }
        first ? "#{first} #{last}" : last
      end

      # Every parenthesised code inside the subjects cell becomes a Category.
      # Returns [] when the cell is absent.
      def categories(doc)
        subjects = doc.at_css('td.subjects')&.text.to_s
        subjects.scan(/\(([^()]+)\)/).flatten.map do |code|
          build_category(code)
        end
      end

      # The first parenthesised code inside the primary-subject element, or
      # nil when the element or the code is missing.
      def primary_category(doc)
        text = doc.at_css('.primary-subject')&.text.to_s
        code = text[/\(([^()]+)\)/, 1]
        build_category(code) if code
      end

      # Synthesize the link set the abstract page does not expose structurally,
      # so pdf_url, content_types and available_in_pdf? keep working. The PDF
      # link is versioned to match the Atom API (e.g. .../pdf/2601.00470v1).
      def links(arxiv_url)
        versioned_id = arxiv_url.split('/abs/', 2).last
        [
          build_link("#{PDF_URL}#{versioned_id}", 'application/pdf'),
          build_link(arxiv_url, 'text/html')
        ]
      end

      # The submission history lists every version's timestamp as
      # "[v1] Thu, 1 Jan 2026 20:56:05 UTC". The first is when the paper was
      # published, the last is when it was last revised. Returns [nil, nil]
      # when no timestamp is found.
      def submission_dates(doc)
        history = doc.at_css('.submission-history')&.text.to_s
        stamps = history.scan(/\[v\d+\]\s*(.+? UTC)/).flatten

        return [nil, nil] if stamps.empty?

        [Time.parse(stamps.first), Time.parse(stamps.last)]
      end

      # Squished text of the comments cell, or nil when the page has none.
      def comment(doc)
        text = doc.at_css('td.comments')&.text
        squish(text) if text
      end

      def build_category(code)
        Arxivarius::Category.new.tap { |category| category.name = code }
      end

      def build_link(url, content_type)
        Arxivarius::Link.new.tap do |link|
          link.url = url
          link.content_type = content_type
        end
      end

      # Content of the first <meta name=...> tag, or nil.
      def meta(doc, name)
        doc.at_css("meta[name='#{name}']")&.[]('content')
      end

      # Contents of every <meta name=...> tag. Tags whose content attribute is
      # missing or blank are skipped so callers never receive nil or "".
      def meta_all(doc, name)
        doc.css("meta[name='#{name}']").filter_map do |tag|
          content = tag['content']
          content unless content.nil? || content.strip.empty?
        end
      end

      # Content of the first <meta property=...> tag, or nil.
      def meta_property(doc, property)
        doc.at_css("meta[property='#{property}']")&.[]('content')
      end

      def squish(string)
        Arxivarius::Text.squish(string)
      end
    end
  end
end
data/lib/arxivarius.rb CHANGED
@@ -14,11 +14,13 @@ require 'arxivarius/author'
14
14
  require 'arxivarius/link'
15
15
  require 'arxivarius/category'
16
16
  require 'arxivarius/paper'
17
+ require 'arxivarius/web_source'
17
18
 
18
19
  module Arxivarius
19
20
  module Error
20
21
  class PaperNotFound < StandardError; end
21
22
  class MalformedId < StandardError; end
23
+ class ApiError < StandardError; end
22
24
  end
23
25
 
24
26
  # ArXiv uses two ID formats:
@@ -31,18 +33,16 @@ module Arxivarius
31
33
  ID_FORMAT = /^#{CURRENT_URL_FORMAT}/
32
34
 
33
35
  class << self
34
- def get(identifier)
36
+ def get(identifier, source: :api)
35
37
  id = parse_arxiv_identifier(identifier)
36
38
 
37
39
  raise Arxivarius::Error::MalformedId, 'Paper ID format is invalid' unless valid_id?(id)
38
40
 
39
41
  id = normalize_legacy_id(id)
40
42
 
41
- url = URI("https://export.arxiv.org/api/query?id_list=#{id}")
42
- response = ::Nokogiri::XML(Net::HTTP.get(url)).remove_namespaces!
43
- paper = Arxivarius::Paper.parse(response.to_s, single: true)
43
+ paper = fetch_paper(id, source)
44
44
 
45
- # Paper is nil when the API returns no <entry> for the given ID.
45
+ # Paper is nil when the source returns no entry for the given ID.
46
46
  raise Arxivarius::Error::PaperNotFound, "Paper #{id} doesn't exist on arXiv" unless paper&.title
47
47
 
48
48
  paper
@@ -50,6 +50,19 @@ module Arxivarius
50
50
 
51
51
  private
52
52
 
53
# Routes the lookup to the backend selected by +source+: :api uses the Atom
# API helper, :web uses WebSource. Any other value raises ArgumentError.
def fetch_paper(id, source)
  return fetch_via_api(id) if source == :api
  return WebSource.fetch(id) if source == :web

  raise ArgumentError, "Unknown source: #{source.inspect}"
end
60
+
61
# Fetches the Atom feed for +id+, strips its XML namespaces and passes the
# re-serialized document to Paper.parse in single-entry mode.
def fetch_via_api(id)
  feed = ::Nokogiri::XML(fetch_xml(id))
  flattened = feed.remove_namespaces!
  Arxivarius::Paper.parse(flattened.to_s, single: true)
end
65
+
53
66
  def parse_arxiv_identifier(identifier)
54
67
  if valid_url?(identifier)
55
68
  format = legacy_url?(identifier) ? LEGACY_URL_FORMAT : CURRENT_URL_FORMAT
@@ -71,6 +84,18 @@ module Arxivarius
71
84
  identifier.match?(LEGACY_URL_FORMAT)
72
85
  end
73
86
 
87
# Requests the Atom API entry for +id+ and returns the raw response body.
# Any non-success HTTP status is raised as Arxivarius::Error::ApiError, with
# the status code and the stripped body in the message.
def fetch_xml(id)
  endpoint = URI("https://export.arxiv.org/api/query?id_list=#{id}")
  response = Net::HTTP.get_response(endpoint)
  return response.body if response.is_a?(Net::HTTPSuccess)

  raise Arxivarius::Error::ApiError,
        "ArXiv API returned #{response.code}: #{response.body&.strip}"
end
98
+
74
99
  # The arXiv API no longer resolves subcategory legacy IDs.
75
100
  # Strips the subcategory: math.DG/0510097 -> math/0510097.
76
101
  def normalize_legacy_id(id)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arxivarius
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - antlypls
@@ -55,6 +55,7 @@ files:
55
55
  - lib/arxivarius/paper.rb
56
56
  - lib/arxivarius/text.rb
57
57
  - lib/arxivarius/version.rb
58
+ - lib/arxivarius/web_source.rb
58
59
  homepage: https://github.com/antlypls/arxivarius
59
60
  licenses:
60
61
  - MIT
@@ -73,7 +74,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
73
74
  - !ruby/object:Gem::Version
74
75
  version: '0'
75
76
  requirements: []
76
- rubygems_version: 4.0.6
77
+ rubygems_version: 4.0.8
77
78
  specification_version: 4
78
79
  summary: Fetch and parse papers metadata from arXiv
79
80
  test_files: []