RubyGems - arxivarius - Versions diffs - 0.10.0 → 0.12.0 - Mend

arxivarius 0.10.0 → 0.12.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml +4 -4
data/README.md +18 -0
data/lib/arxivarius/version.rb +1 -1
data/lib/arxivarius/web_source.rb +148 -0
data/lib/arxivarius.rb +30 -5
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ae569c1030d082bb94e10a63472ef34aeb89bb9bef56db25c57f68254dc00be8
-  data.tar.gz: f3f664f4a8feb5b7a585166187590018ae68f7e98cd5ae290a85259baab40222
+  metadata.gz: fd93f41d3a1a1d7e703ac7637a158b91c99eceb592adbdfa3b95577a943ce096
+  data.tar.gz: '0399895405a2e0df92c2164dce3e047c0ada3ee97e02a7342207b642ccc8c759'
 SHA512:
-  metadata.gz: f218c5e45f1f1305ab9437a205f9c35db5494bc5491af3d6d19c560e025ba452990caff7fb709fcde5e07302fe73ad8ea9e2a6f043831873cdd076c2fd8a8ccf
-  data.tar.gz: 95396dc42e302e54b4e6ae42475bd375ee77d8f49b04ee6383e57dfb0a8d144c2cc1c59cd135ea57c043fa1be8c9fc575c8cf09240dea03179e939097415ca7b
+  metadata.gz: 33b5fb64c8b9583bca078e5f1b612942346ce3b67334db53ee83aaf86944001f255b668e6a8c6ff897573ff13d5fd6f3b33f22cbff70e06f2c9de4ce06ce006c
+  data.tar.gz: 44729f9aef41d60f10bbb7e3f81235ac51f3cef4446d2d2bdcbb2e8d8537151b38baac0350b21c110fbde0eec2d8bca89472705f9e37ad1cd55c4e27f8cb8210

data/README.md CHANGED Viewed

@@ -27,6 +27,24 @@ Pass any arXiv ID to `Arxivarius.get`:
 paper = Arxivarius.get('2601.00470')
 ```
+### Data sources
+By default the gem queries the arXiv Atom API (`export.arxiv.org`). That host
+has been unstable lately and sometimes responds with `429 Rate exceeded`
+(`Arxivarius::Error::ApiError`). When it does, you can fall back to scraping the
+public abstract page (`arxiv.org/abs/...`) instead:
+```ruby
+Arxivarius.get('2601.00470')                # arXiv API (default)
+Arxivarius.get('2601.00470', source: :api)  # arXiv API, explicitly
+Arxivarius.get('2601.00470', source: :web)  # scrape the abstract page
+```
+Both sources return the same `Paper` object, so the rest of the API below works
+unchanged. The web source recovers every field the API does **except author
+affiliations**, which the abstract page does not list (`author.affiliations`
+is `[]`).
 All common ID formats work:
 ```ruby

data/lib/arxivarius/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Arxivarius
-  VERSION = '0.10.0'
+  VERSION = '0.12.0'
 end

data/lib/arxivarius/web_source.rb ADDED Viewed

@@ -0,0 +1,148 @@
+# frozen_string_literal: true
+module Arxivarius
+  # Builds a Paper by scraping the public arXiv abstract page
+  # (https://arxiv.org/abs/<id>), used as an alternative to the Atom API when
+  # it is rate limited. Reads the Highwire `citation_*` <meta> tags plus a few
+  # body elements. Every field the API exposes is recovered except author
+  # affiliations, which are not present on the abstract page.
+  module WebSource
+    ABS_URL = 'https://arxiv.org/abs/'
+    PDF_URL = 'https://arxiv.org/pdf/'
+    USER_AGENT = "arxivarius/#{Arxivarius::VERSION} " \
+                 '(+https://github.com/antlypls/arxivarius)'.freeze
+    class << self
+      def fetch(id)
+        doc = ::Nokogiri::HTML(fetch_html(id))
+        # No citation_title means the page is not an abstract page (e.g. an
+        # arXiv "identifier not recognized" page served with a 200 status).
+        return nil unless meta(doc, 'citation_title')
+        build_paper(doc)
+      end
+      private
+      def fetch_html(id)
+        url = URI("#{ABS_URL}#{id}")
+        response = Net::HTTP.get_response(url, 'User-Agent' => USER_AGENT)
+        return nil if response.is_a?(Net::HTTPNotFound)
+        unless response.is_a?(Net::HTTPSuccess)
+          message = "ArXiv returned #{response.code}: #{response.body&.strip}"
+          raise Arxivarius::Error::ApiError, message
+        end
+        response.body
+      end
+      def build_paper(doc)
+        Arxivarius::Paper.new.tap do |paper|
+          apply_metadata(paper, doc)
+          paper.created_at, paper.updated_at = submission_dates(doc)
+          apply_associations(paper, doc)
+        end
+      end
+      def apply_metadata(paper, doc)
+        paper.arxiv_url = meta_property(doc, 'og:url')
+        paper.title = squish(meta(doc, 'citation_title'))
+        paper.summary = squish(meta(doc, 'citation_abstract'))
+        paper.comment = comment(doc)
+      end
+      def apply_associations(paper, doc)
+        paper.authors = authors(doc)
+        paper.categories = categories(doc)
+        paper.primary_category = primary_category(doc)
+        paper.links = links(paper.arxiv_url)
+      end
+      # arXiv lists authors as "Last, First" in citation_author; reorder to
+      # "First Last" so they match the Atom API output exactly.
+      def authors(doc)
+        meta_all(doc, 'citation_author').map do |raw|
+          last, first = raw.split(',', 2).map { |part| squish(part) }
+          name = first ? "#{first} #{last}" : last
+          # Affiliations are not on the abstract page; match the API's empty
+          # list rather than leaving them nil.
+          Arxivarius::Author.new.tap do |author|
+            author.name = name
+            author.affiliations = []
+          end
+        end
+      end
+      def categories(doc)
+        subjects = doc.at_css('td.subjects')&.text.to_s
+        subjects.scan(/\(([^()]+)\)/).flatten.map do |code|
+          build_category(code)
+        end
+      end
+      def primary_category(doc)
+        text = doc.at_css('.primary-subject')&.text.to_s
+        code = text[/\(([^()]+)\)/, 1]
+        build_category(code) if code
+      end
+      # Synthesize the link set the abstract page does not expose structurally,
+      # so pdf_url, content_types and available_in_pdf? keep working. The PDF
+      # link is versioned to match the Atom API (e.g. .../pdf/2601.00470v1).
+      def links(arxiv_url)
+        versioned_id = arxiv_url.split('/abs/', 2).last
+        [
+          build_link("#{PDF_URL}#{versioned_id}", 'application/pdf'),
+          build_link(arxiv_url, 'text/html')
+        ]
+      end
+      # The submission history lists every version's timestamp as
+      # "[v1] Thu, 1 Jan 2026 20:56:05 UTC". The first is when the paper was
+      # published, the last is when it was last revised.
+      def submission_dates(doc)
+        history = doc.at_css('.submission-history')&.text.to_s
+        stamps = history.scan(/\[v\d+\]\s*(.+? UTC)/).flatten
+        return [nil, nil] if stamps.empty?
+        [Time.parse(stamps.first), Time.parse(stamps.last)]
+      end
+      def comment(doc)
+        text = doc.at_css('td.comments')&.text
+        squish(text) if text
+      end
+      def build_category(code)
+        Arxivarius::Category.new.tap { |category| category.name = code }
+      end
+      def build_link(url, content_type)
+        Arxivarius::Link.new.tap do |link|
+          link.url = url
+          link.content_type = content_type
+        end
+      end
+      def meta(doc, name)
+        doc.at_css("meta[name='#{name}']")&.[]('content')
+      end
+      def meta_all(doc, name)
+        doc.css("meta[name='#{name}']").map { |tag| tag['content'] }
+      end
+      def meta_property(doc, property)
+        doc.at_css("meta[property='#{property}']")&.[]('content')
+      end
+      def squish(string)
+        Arxivarius::Text.squish(string)
+      end
+    end
+  end
+end

data/lib/arxivarius.rb CHANGED Viewed

@@ -14,11 +14,13 @@ require 'arxivarius/author'
 require 'arxivarius/link'
 require 'arxivarius/category'
 require 'arxivarius/paper'
+require 'arxivarius/web_source'
 module Arxivarius
   module Error
     class PaperNotFound < StandardError; end
     class MalformedId < StandardError; end
+    class ApiError < StandardError; end
   end
   # ArXiv uses two ID formats:
@@ -31,18 +33,16 @@ module Arxivarius
   ID_FORMAT = /^#{CURRENT_URL_FORMAT}/
   class << self
-    def get(identifier)
+    def get(identifier, source: :api)
       id = parse_arxiv_identifier(identifier)
       raise Arxivarius::Error::MalformedId, 'Paper ID format is invalid' unless valid_id?(id)
       id = normalize_legacy_id(id)
-      url = URI("https://export.arxiv.org/api/query?id_list=#{id}")
-      response = ::Nokogiri::XML(Net::HTTP.get(url)).remove_namespaces!
-      paper = Arxivarius::Paper.parse(response.to_s, single: true)
+      paper = fetch_paper(id, source)
-      # Paper is nil when the API returns no <entry> for the given ID.
+      # Paper is nil when the source returns no entry for the given ID.
       raise Arxivarius::Error::PaperNotFound, "Paper #{id} doesn't exist on arXiv" unless paper&.title
       paper
@@ -50,6 +50,19 @@ module Arxivarius
     private
+    def fetch_paper(id, source)
+      case source
+      when :api then fetch_via_api(id)
+      when :web then WebSource.fetch(id)
+      else raise ArgumentError, "Unknown source: #{source.inspect}"
+      end
+    end
+    def fetch_via_api(id)
+      response = ::Nokogiri::XML(fetch_xml(id)).remove_namespaces!
+      Arxivarius::Paper.parse(response.to_s, single: true)
+    end
     def parse_arxiv_identifier(identifier)
       if valid_url?(identifier)
         format = legacy_url?(identifier) ? LEGACY_URL_FORMAT : CURRENT_URL_FORMAT
@@ -71,6 +84,18 @@ module Arxivarius
       identifier.match?(LEGACY_URL_FORMAT)
     end
+    def fetch_xml(id)
+      url = URI("https://export.arxiv.org/api/query?id_list=#{id}")
+      response = Net::HTTP.get_response(url)
+      unless response.is_a?(Net::HTTPSuccess)
+        message = "ArXiv API returned #{response.code}: #{response.body&.strip}"
+        raise Arxivarius::Error::ApiError, message
+      end
+      response.body
+    end
     # The arXiv API no longer resolves subcategory legacy IDs.
     # Strips the subcategory: math.DG/0510097 -> math/0510097.
     def normalize_legacy_id(id)

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: arxivarius
 version: !ruby/object:Gem::Version
-  version: 0.10.0
+  version: 0.12.0
 platform: ruby
 authors:
 - antlypls
@@ -55,6 +55,7 @@ files:
 - lib/arxivarius/paper.rb
 - lib/arxivarius/text.rb
 - lib/arxivarius/version.rb
+- lib/arxivarius/web_source.rb
 homepage: https://github.com/antlypls/arxivarius
 licenses:
 - MIT
@@ -73,7 +74,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 4.0.6
+rubygems_version: 4.0.8
 specification_version: 4
 summary: Fetch and parse papers metadata from arXiv
 test_files: []