RubyGems - relaton-omg - Versions diffs - 1.18.0 → 1.18.1 - Mend

relaton-omg 1.18.0 → 1.18.1

Files changed (7) hide show

checksums.yaml +4 -4
data/lib/relaton_omg/omg_bibliography.rb +1 -1
data/lib/relaton_omg/scraper.rb +136 -0
data/lib/relaton_omg/version.rb +1 -1
data/lib/relaton_omg.rb +1 -1
metadata +4 -4
data/lib/relaton_omg/scrapper.rb +0 -121

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: cfc5b10fea7afe2778adf2b0a5b681b3155305acf823eb5f08d29ac07d6d95ef
-  data.tar.gz: 74f40b36ac9b0ad1c8d979e7b21a1249234a37aba35d3791c4ab44325d75e798
+  metadata.gz: 3aee35ace5d33ef6a4058a8da3c76d59af173315a7ed32cb5967c0dfb9fd8296
+  data.tar.gz: a1138366be2d18d01354d29db3a8fc0870c4c73cbbbe3ac67ffa3700ae85f9bc
 SHA512:
-  metadata.gz: 115a462f0e2c13e91cb89882ef4f5bed4d557b0a7332aa10706229d4091ada41439c47265cea58ea342e117ccb794d4d093ce3fb289352f9440c1de31dd03662
-  data.tar.gz: dfde51a15b0753757b6c9f6776ca803c7a2cd58d0e9677e7b821f238f4f7d521b921804b95418ec47e782796a3ac00d3eeec7b3f88373b34035f62fc6a5f78fd
+  metadata.gz: b5194c85224823ac10a951ed66de9c558659751b967d909d53cbbee37ad60eb750f92b60c079fd1f125c324e23dbb651072a40eefd77018d50776331f7228b37
+  data.tar.gz: 1b9ef36597bec1b10285d0e5f3583615aa774f0b901d98d3042477f6c9368c20e9cf60d4f47d00564f0effc99c9d4d5bd236ab0c62e5ee70531151da899cabc3

data/lib/relaton_omg/omg_bibliography.rb CHANGED Viewed

@@ -7,7 +7,7 @@ module RelatonOmg
       # @param code [String] the OMG standard reference
       # @return [RelatonOmg::OmgBibliographicItem]
       def search(text)
-        Scrapper.scrape_page text
+        Scraper.scrape_page text
       end
       # @param code [String] the OMG standard reference

data/lib/relaton_omg/scraper.rb ADDED Viewed

@@ -0,0 +1,136 @@
+require "nokogiri"
+module RelatonOmg
+  class Scraper
+    URL_PATTERN = "https://www.omg.org/spec/".freeze
+    def initialize(acronym, version = nil, spec = nil)
+      @acronym = acronym
+      @version = version
+      @spec = spec
+    end
+    def self.scrape_page(ref)
+      %r{^OMG (?<acronym>[^\s]+)(?:[\s/](?<version>[\d.]+(?:\sbeta(?:\s\d)?)?))?(?:[\s/](?<spec>\w+))?$} =~ ref
+      return unless acronym
+      scraper = new(acronym, version, spec)
+      doc = scraper.get_doc
+      return if doc.nil? || scraper.fetch_link.empty?
+      OmgBibliographicItem.new(**scraper.item)
+    end
+    def get_doc
+      @url = "#{URL_PATTERN}#{@acronym}/"
+      @url += @version.gsub(' ', '/') if @version
+      @doc = Nokogiri::HTML OpenURI.open_uri(@url, open_timeout: 10)
+    rescue OpenURI::HTTPError, URI::InvalidURIError, Net::OpenTimeout => e
+      return if e.is_a?(URI::InvalidURIError) || e.io.status[0] == "404"
+      raise RelatonBib::RequestError, "Unable acces #{@url} (#{e.io.status.join(' ')})"
+    end
+    def item
+      {
+        id: fetch_id,
+        fetched: Date.today.to_s,
+        docid: fetch_docid,
+        title: fetch_title,
+        abstract: fetch_abstract,
+        version: fetch_version,
+        date: fetch_date,
+        docstatus: fetch_status,
+        link: fetch_link,
+        relation: fetch_relation,
+        keyword: fetch_keyword,
+        license: fetch_license,
+      }
+    end
+    def fetch_id
+      "#{@acronym}#{doc_version}#{@spec}"
+    end
+    def fetch_title
+      content = @doc.at('//dt[.="Title:"]/following-sibling::dd').text
+      content += ": #{@spec}" if @spec
+      title = RelatonBib::FormattedString.new content: content, language: "en", script: "Latn"
+      [RelatonBib::TypedTitleString.new(type: "main", title: title)]
+    end
+    def fetch_docid
+      id = [@acronym]
+      id << doc_version if doc_version
+      id << @spec if @spec
+      [RelatonBib::DocumentIdentifier.new(id: id.join(" "), type: "OMG", primary: true)]
+    end
+    def fetch_abstract
+      content = @doc.at('//section[@id="document-metadata"]/div/div/p').text
+      [{ content: content, language: "en", script: "Latn" }]
+    end
+    def fetch_version
+      [RelatonBib::BibliographicItem::Version.new(pub_date, doc_version)]
+    end
+    def doc_version
+      @doc_version ||= @doc.at('//dt[.="Version:"]/following-sibling::dd/p/span').text
+    end
+    def fetch_date
+      [type: "published", on: pub_date.to_s]
+    end
+    def pub_date
+      Date.parse @doc.at('//dt[.="Publication Date:"]/following-sibling::dd').text.strip
+    end
+    def fetch_status
+      status = @doc.at('//dt[.="Document Status:"]/following-sibling::dd')
+      stage = status.text.strip.match(/\w+/).to_s
+      RelatonBib::DocumentStatus.new(stage: stage)
+    end
+    def fetch_link
+      return @link if @link
+      @links = []
+      if @spec
+        a = @doc.at("//a[@href='#{@url}/#{@spec}/PDF']")
+        @links << { type: "src", content: a[:href] } if a
+      else
+        a = @doc.at('//dt[.="This Document:"]/following-sibling::dd/a')
+        @links << { type: "src", content: a[:href] } if a
+        pdf = @doc.at('//a[@class="download-document"]')
+        @links << { type: "pdf", content: pdf[:href] } if pdf
+      end
+      @links
+    end
+    def fetch_relation
+      v = @doc.xpath('//h2[.="History"]/following-sibling::section/div/table/tbody/tr')
+      v.reduce([]) do |mem, row|
+        ver = row.at("td").text
+        unless ver == doc_version
+          acronym = row.at("td[3]/a")[:href].split("/")[4]
+          fref = RelatonBib::FormattedRef.new content: "OMG #{acronym} #{ver}"
+          bibitem = OmgBibliographicItem.new formattedref: fref
+          mem << { type: "obsoletes", bibitem: bibitem }
+        end
+        mem
+      end
+    end
+    def fetch_keyword
+      @doc.xpath('//dt[.="Categories:"]/following-sibling::dd/ul/li/a/em').map &:text
+    end
+    def fetch_license
+      @doc.xpath(
+        '//dt/span/a[contains(., "IPR Mode")]/../../following-sibling::dd/span',
+      ).map { |l| l.text.match(/[\w\s-]+/).to_s.strip }
+    end
+  end
+end

data/lib/relaton_omg/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module RelatonOmg
-  VERSION = "1.18.0".freeze
+  VERSION = "1.18.1".freeze
 end

data/lib/relaton_omg.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require "relaton_bib"
 require "relaton_omg/version"
 require "relaton_omg/config"
 require "relaton_omg/util"
-require "relaton_omg/scrapper"
+require "relaton_omg/scraper"
 require "relaton_omg/omg_bibliography"
 require "relaton_omg/omg_bibliographic_item"
 require "relaton_omg/xml_parser"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: relaton-omg
 version: !ruby/object:Gem::Version
-  version: 1.18.0
+  version: 1.18.1
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-01-08 00:00:00.000000000 Z
+date: 2024-06-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: relaton-bib
@@ -56,7 +56,7 @@ files:
 - lib/relaton_omg/omg_bibliographic_item.rb
 - lib/relaton_omg/omg_bibliography.rb
 - lib/relaton_omg/processor.rb
-- lib/relaton_omg/scrapper.rb
+- lib/relaton_omg/scraper.rb
 - lib/relaton_omg/util.rb
 - lib/relaton_omg/version.rb
 - lib/relaton_omg/xml_parser.rb
@@ -82,7 +82,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.3.26
+rubygems_version: 3.3.27
 signing_key:
 specification_version: 4
 summary: 'RelatonOmg: retrieve OMG Standards for bibliographic using the IsoBibliographicItem

data/lib/relaton_omg/scrapper.rb DELETED Viewed

@@ -1,121 +0,0 @@
-require "nokogiri"
-module RelatonOmg
-  module Scrapper
-    URL_PATTERN = "https://www.omg.org/spec/".freeze
-    class << self
-      def scrape_page(ref) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
-        %r{OMG (?<acronym>[^\s]+)\s?(?<version>.*)} =~ ref
-        return unless acronym
-        url = URL_PATTERN + acronym
-        url += "/#{version}" if version
-        doc = Nokogiri::HTML OpenURI.open_uri(url, open_timeout: 10)
-        OmgBibliographicItem.new(**item(doc, acronym))
-      rescue OpenURI::HTTPError, URI::InvalidURIError, Net::OpenTimeout => e
-        return if e.is_a?(URI::InvalidURIError) || e.io.status[0] == "404"
-        raise RelatonBib::RequestError, "Unable acces #{url} (#{e.io.status.join(' ')})"
-      end
-      private
-      def item(doc, acronym) # rubocop:disable Metrics/MethodLength
-        {
-          id: fetch_id(doc, acronym),
-          fetched: Date.today.to_s,
-          docid: fetch_docid(doc, acronym),
-          title: fetch_title(doc),
-          abstract: fetch_abstract(doc),
-          version: fetch_version(doc),
-          date: fetch_date(doc),
-          docstatus: fetch_status(doc),
-          link: fetch_link(doc),
-          relation: fetch_relation(doc),
-          keyword: fetch_keyword(doc),
-          license: fetch_license(doc),
-        }
-      end
-      def fetch_id(doc, acronym)
-        acronym + version(doc)
-      end
-      def fetch_title(doc)
-        content = doc.at('//dt[.="Title:"]/following-sibling::dd').text
-        title = RelatonBib::FormattedString.new content: content, language: "en", script: "Latn"
-        [RelatonBib::TypedTitleString.new(type: "main", title: title)]
-      end
-      def fetch_docid(doc, acronym)
-        id = [acronym]
-        if (ver = version(doc))
-          id << ver
-        end
-        [RelatonBib::DocumentIdentifier.new(id: id.join(" "), type: "OMG", primary: true)]
-      end
-      def fetch_abstract(doc)
-        content = doc.at('//section[@id="document-metadata"]/div/div/p').text
-        [{ content: content, language: "en", script: "Latn" }]
-      end
-      def fetch_version(doc)
-        [RelatonBib::BibliographicItem::Version.new(pub_date(doc), version(doc))]
-      end
-      def version(doc)
-        doc.at('//dt[.="Version:"]/following-sibling::dd/p/span').text
-      end
-      def fetch_date(doc)
-        [type: "published", on: pub_date(doc).to_s]
-      end
-      def pub_date(doc)
-        Date.parse doc.at('//dt[.="Publication Date:"]/following-sibling::dd').text.strip
-      end
-      def fetch_status(doc)
-        status = doc.at('//dt[.="Document Status:"]/following-sibling::dd')
-        stage = status.text.strip.match(/\w+/).to_s
-        RelatonBib::DocumentStatus.new(stage: stage)
-      end
-      def fetch_link(doc)
-        links = []
-        a = doc.at('//dt[.="This Document:"]/following-sibling::dd/a')
-        links << { type: "src", content: a[:href] } if a
-        pdf = doc.at('//a[@class="download-document"]')
-        links << { type: "pdf", content: pdf[:href] } if pdf
-        links
-      end
-      def fetch_relation(doc) # rubocop:disable Metrics/MethodLength
-        current_version = version(doc)
-        v = doc.xpath('//h2[.="History"]/following-sibling::section/div/table/tbody/tr')
-        v.reduce([]) do |mem, row|
-          ver = row.at("td").text
-          unless ver == current_version
-            acronym = row.at("td[3]/a")[:href].split("/")[4]
-            fref = RelatonBib::FormattedRef.new content: "OMG #{acronym} #{ver}"
-            bibitem = OmgBibliographicItem.new formattedref: fref
-            mem << { type: "obsoletes", bibitem: bibitem }
-          end
-          mem
-        end
-      end
-      def fetch_keyword(doc)
-        doc.xpath('//dt[.="Categories:"]/following-sibling::dd/ul/li/a/em').map &:text
-      end
-      def fetch_license(doc)
-        doc.xpath(
-          '//dt/span/a[contains(., "IPR Mode")]/../../following-sibling::dd/span',
-        ).map { |l| l.text.match(/[\w\s-]+/).to_s.strip }
-      end
-    end
-  end
-end