arxivarius 0.11.0 → 0.12.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7fb7f547ba6bce8ce3f0f8558dcab98ac276cdd24b9b7576551b2da85c0ca8c6
4
- data.tar.gz: 2f8426e7c7c1f7889808bbd72117ad2af9cf416236c4cc93b6060ea7943be57b
3
+ metadata.gz: fd93f41d3a1a1d7e703ac7637a158b91c99eceb592adbdfa3b95577a943ce096
4
+ data.tar.gz: '0399895405a2e0df92c2164dce3e047c0ada3ee97e02a7342207b642ccc8c759'
5
5
  SHA512:
6
- metadata.gz: 2cbe81bb238d7fe52fe449a36faf32e5b5b16d96a338b081accb8c924b5bfd972fb74cd2d06aeb8752d0788bab623976a4649c1fd5e82eb66fe62ecb04a0f979
7
- data.tar.gz: 48ef9e189a64ab2cb93bdf201c97a8407d60862bc370739af3bd7fa4f298c914ea89e6cc751fdaddaea90492027c6d5dffafecc3317e97e4198744b5db78b7f6
6
+ metadata.gz: 33b5fb64c8b9583bca078e5f1b612942346ce3b67334db53ee83aaf86944001f255b668e6a8c6ff897573ff13d5fd6f3b33f22cbff70e06f2c9de4ce06ce006c
7
+ data.tar.gz: 44729f9aef41d60f10bbb7e3f81235ac51f3cef4446d2d2bdcbb2e8d8537151b38baac0350b21c110fbde0eec2d8bca89472705f9e37ad1cd55c4e27f8cb8210
data/README.md CHANGED
@@ -27,6 +27,24 @@ Pass any arXiv ID to `Arxivarius.get`:
27
27
  paper = Arxivarius.get('2601.00470')
28
28
  ```
29
29
 
30
+ ### Data sources
31
+
32
+ By default the gem queries the arXiv Atom API (`export.arxiv.org`). That host
33
+ has been unstable lately and sometimes responds with `429 Rate exceeded`
34
+ (surfaced as `Arxivarius::Error::ApiError`). When it does, you can fall back to scraping the
35
+ public abstract page (`arxiv.org/abs/...`) instead:
36
+
37
+ ```ruby
38
+ Arxivarius.get('2601.00470') # arXiv API (default)
39
+ Arxivarius.get('2601.00470', source: :api) # arXiv API, explicitly
40
+ Arxivarius.get('2601.00470', source: :web) # scrape the abstract page
41
+ ```
42
+
43
+ Both sources return the same `Paper` object, so the rest of the API below works
44
+ unchanged. The web source recovers every field the API does **except author
45
+ affiliations**, which the abstract page does not list (`author.affiliations`
46
+ is `[]`).
47
+
30
48
  All common ID formats work:
31
49
 
32
50
  ```ruby
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Arxivarius
4
- VERSION = '0.11.0'
4
+ VERSION = '0.12.0'
5
5
  end
@@ -0,0 +1,148 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Arxivarius
4
+ # Builds a Paper by scraping the public arXiv abstract page
5
+ # (https://arxiv.org/abs/<id>), used as an alternative to the Atom API when
6
+ # it is rate limited. Reads the Highwire `citation_*` <meta> tags plus a few
7
+ # body elements. Every field the API exposes is recovered except author
8
+ # affiliations, which are not present on the abstract page.
9
+ module WebSource
10
+ ABS_URL = 'https://arxiv.org/abs/'
11
+ PDF_URL = 'https://arxiv.org/pdf/'
12
+ USER_AGENT = "arxivarius/#{Arxivarius::VERSION} " \
13
+ '(+https://github.com/antlypls/arxivarius)'.freeze
14
+
15
+ class << self
16
+ def fetch(id)
17
+ doc = ::Nokogiri::HTML(fetch_html(id))
18
+
19
+ # No citation_title means the page is not an abstract page (e.g. an
20
+ # arXiv "identifier not recognized" page served with a 200 status).
21
+ return nil unless meta(doc, 'citation_title')
22
+
23
+ build_paper(doc)
24
+ end
25
+
26
+ private
27
+
28
+ def fetch_html(id)
29
+ url = URI("#{ABS_URL}#{id}")
30
+ response = Net::HTTP.get_response(url, 'User-Agent' => USER_AGENT)
31
+
32
+ return nil if response.is_a?(Net::HTTPNotFound)
33
+
34
+ unless response.is_a?(Net::HTTPSuccess)
35
+ message = "ArXiv returned #{response.code}: #{response.body&.strip}"
36
+ raise Arxivarius::Error::ApiError, message
37
+ end
38
+
39
+ response.body
40
+ end
41
+
42
+ def build_paper(doc)
43
+ Arxivarius::Paper.new.tap do |paper|
44
+ apply_metadata(paper, doc)
45
+ paper.created_at, paper.updated_at = submission_dates(doc)
46
+ apply_associations(paper, doc)
47
+ end
48
+ end
49
+
50
+ def apply_metadata(paper, doc)
51
+ paper.arxiv_url = meta_property(doc, 'og:url')
52
+ paper.title = squish(meta(doc, 'citation_title'))
53
+ paper.summary = squish(meta(doc, 'citation_abstract'))
54
+ paper.comment = comment(doc)
55
+ end
56
+
57
+ def apply_associations(paper, doc)
58
+ paper.authors = authors(doc)
59
+ paper.categories = categories(doc)
60
+ paper.primary_category = primary_category(doc)
61
+ paper.links = links(paper.arxiv_url)
62
+ end
63
+
64
+ # arXiv lists authors as "Last, First" in citation_author; reorder to
65
+ # "First Last" so they match the Atom API output exactly.
66
+ def authors(doc)
67
+ meta_all(doc, 'citation_author').map do |raw|
68
+ last, first = raw.split(',', 2).map { |part| squish(part) }
69
+ name = first ? "#{first} #{last}" : last
70
+ # Affiliations are not on the abstract page; match the API's empty
71
+ # list rather than leaving them nil.
72
+ Arxivarius::Author.new.tap do |author|
73
+ author.name = name
74
+ author.affiliations = []
75
+ end
76
+ end
77
+ end
78
+
79
+ def categories(doc)
80
+ subjects = doc.at_css('td.subjects')&.text.to_s
81
+ subjects.scan(/\(([^()]+)\)/).flatten.map do |code|
82
+ build_category(code)
83
+ end
84
+ end
85
+
86
+ def primary_category(doc)
87
+ text = doc.at_css('.primary-subject')&.text.to_s
88
+ code = text[/\(([^()]+)\)/, 1]
89
+ build_category(code) if code
90
+ end
91
+
92
+ # Synthesize the link set the abstract page does not expose structurally,
93
+ # so pdf_url, content_types and available_in_pdf? keep working. The PDF
94
+ # link is versioned to match the Atom API (e.g. .../pdf/2601.00470v1).
95
+ def links(arxiv_url)
96
+ versioned_id = arxiv_url.split('/abs/', 2).last
97
+ [
98
+ build_link("#{PDF_URL}#{versioned_id}", 'application/pdf'),
99
+ build_link(arxiv_url, 'text/html')
100
+ ]
101
+ end
102
+
103
+ # The submission history lists every version's timestamp as
104
+ # "[v1] Thu, 1 Jan 2026 20:56:05 UTC". The first is when the paper was
105
+ # published, the last is when it was last revised.
106
+ def submission_dates(doc)
107
+ history = doc.at_css('.submission-history')&.text.to_s
108
+ stamps = history.scan(/\[v\d+\]\s*(.+? UTC)/).flatten
109
+
110
+ return [nil, nil] if stamps.empty?
111
+
112
+ [Time.parse(stamps.first), Time.parse(stamps.last)]
113
+ end
114
+
115
+ def comment(doc)
116
+ text = doc.at_css('td.comments')&.text
117
+ squish(text) if text
118
+ end
119
+
120
+ def build_category(code)
121
+ Arxivarius::Category.new.tap { |category| category.name = code }
122
+ end
123
+
124
+ def build_link(url, content_type)
125
+ Arxivarius::Link.new.tap do |link|
126
+ link.url = url
127
+ link.content_type = content_type
128
+ end
129
+ end
130
+
131
+ def meta(doc, name)
132
+ doc.at_css("meta[name='#{name}']")&.[]('content')
133
+ end
134
+
135
+ def meta_all(doc, name)
136
+ doc.css("meta[name='#{name}']").map { |tag| tag['content'] }
137
+ end
138
+
139
+ def meta_property(doc, property)
140
+ doc.at_css("meta[property='#{property}']")&.[]('content')
141
+ end
142
+
143
+ def squish(string)
144
+ Arxivarius::Text.squish(string)
145
+ end
146
+ end
147
+ end
148
+ end
data/lib/arxivarius.rb CHANGED
@@ -14,6 +14,7 @@ require 'arxivarius/author'
14
14
  require 'arxivarius/link'
15
15
  require 'arxivarius/category'
16
16
  require 'arxivarius/paper'
17
+ require 'arxivarius/web_source'
17
18
 
18
19
  module Arxivarius
19
20
  module Error
@@ -32,17 +33,16 @@ module Arxivarius
32
33
  ID_FORMAT = /^#{CURRENT_URL_FORMAT}/
33
34
 
34
35
  class << self
35
- def get(identifier)
36
+ def get(identifier, source: :api)
36
37
  id = parse_arxiv_identifier(identifier)
37
38
 
38
39
  raise Arxivarius::Error::MalformedId, 'Paper ID format is invalid' unless valid_id?(id)
39
40
 
40
41
  id = normalize_legacy_id(id)
41
42
 
42
- response = ::Nokogiri::XML(fetch_xml(id)).remove_namespaces!
43
- paper = Arxivarius::Paper.parse(response.to_s, single: true)
43
+ paper = fetch_paper(id, source)
44
44
 
45
- # Paper is nil when the API returns no <entry> for the given ID.
45
+ # Paper is nil when the source returns no entry for the given ID.
46
46
  raise Arxivarius::Error::PaperNotFound, "Paper #{id} doesn't exist on arXiv" unless paper&.title
47
47
 
48
48
  paper
@@ -50,6 +50,19 @@ module Arxivarius
50
50
 
51
51
  private
52
52
 
53
+ def fetch_paper(id, source)
54
+ case source
55
+ when :api then fetch_via_api(id)
56
+ when :web then WebSource.fetch(id)
57
+ else raise ArgumentError, "Unknown source: #{source.inspect}"
58
+ end
59
+ end
60
+
61
+ def fetch_via_api(id)
62
+ response = ::Nokogiri::XML(fetch_xml(id)).remove_namespaces!
63
+ Arxivarius::Paper.parse(response.to_s, single: true)
64
+ end
65
+
53
66
  def parse_arxiv_identifier(identifier)
54
67
  if valid_url?(identifier)
55
68
  format = legacy_url?(identifier) ? LEGACY_URL_FORMAT : CURRENT_URL_FORMAT
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arxivarius
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - antlypls
@@ -55,6 +55,7 @@ files:
55
55
  - lib/arxivarius/paper.rb
56
56
  - lib/arxivarius/text.rb
57
57
  - lib/arxivarius/version.rb
58
+ - lib/arxivarius/web_source.rb
58
59
  homepage: https://github.com/antlypls/arxivarius
59
60
  licenses:
60
61
  - MIT