RubyGems - relaton-iso - Versions diffs - 1.16.2 → 1.16.4 - Mend

relaton-iso 1.16.2 → 1.16.4

Files changed (6) hide show

checksums.yaml +4 -4
data/.github/workflows/rake.yml +2 -0
data/lib/relaton_iso/processor.rb +1 -1
data/lib/relaton_iso/scrapper.rb +378 -367
data/lib/relaton_iso/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 479a728a58c56799448fd6d468e0d19fe245b731119f8dcd9ae6f19a7b624e07
-  data.tar.gz: ac89507180ca01978bfe98b68fbe02450f2c33015bd38d788752f3bf933911ad
+  metadata.gz: 24973dcb87074a6029a83761f4690db8c53203e7a65928ea766ddeaa61b6d167
+  data.tar.gz: 15f8936150781349e849ec0a89edc2ba1e24cb7a105973d6479418a7c98f9ffa
 SHA512:
-  metadata.gz: 71cc49dc2afa8690f02f7035ec5cc13981eb620e2b8c3792456401c152a4ca8192b2ffbd7445c6c982886e61f679427a2d5afbf26e13c6ebcfffcc8d54f7e5c9
-  data.tar.gz: 853da0772a998533c5f461ff297bef978c75e1f58b2df1fec5eff0fea6d306807420453a6ba37b1348d44e58c3a71dcf743228c7703b73fd9b7d72c9d4309598
+  metadata.gz: ae2ac0909781b8f8f196a259444cc55e3fc8d92eccca7f0d83da769dd27b10c0c95a3cc59e391a52e27cc0320f3149f4498b8004e8d259c98fa9bfa3947b7b81
+  data.tar.gz: 4143f5870f15be800efe77cac822c90e7a345c5d6613b1481e7f88598b7d50f701546eaf9138c8bbebde22ad6473a5a7c7e4f81768b3656855e43d91d7ec10d8

data/.github/workflows/rake.yml CHANGED Viewed

@@ -7,6 +7,8 @@ on:
     branches: [ master, main ]
     tags: [ v* ]
   pull_request:
+  schedule:
+    - cron: '0 0 * * *'
 jobs:
   rake:

data/lib/relaton_iso/processor.rb CHANGED Viewed

@@ -4,7 +4,7 @@ module RelatonIso
   class Processor < Relaton::Processor
     attr_reader :idtype
-    def initialize
+    def initialize # rubocop:disable Lint/MissingSuper
       @short = :relaton_iso
       @prefix = "ISO"
       @defaultprefix = %r{^ISO(/IEC)?\s}

data/lib/relaton_iso/scrapper.rb CHANGED Viewed

@@ -43,418 +43,429 @@ module RelatonIso
                   url: "www.asme.org" },
     }.freeze
-    class << self
-      # Parse page.
-      # @param hit [RelatonIso::Hit]
-      # @param lang [String, NilClass]
-      # @return [RelatonIsoBib::IsoBibliographicItem]
-      def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
-        # path = "/contents/data/standard#{hit_data['splitPath']}/"\
-        # "#{hit_data['csnumber']}.html"
-        path = hit.hit[:path].sub("/sites/isoorg", "")
-        doc, url = get_page "#{path}.html"
-        # Fetch edition.
-        edition = doc.at("//div[div[.='Edition']]/text()[last()]")
-          &.text&.match(/\d+$/)&.to_s
-        hit.pubid.base.edition ||= edition if hit.pubid.base
-        titles, abstract, langs = fetch_titles_abstract(doc, lang)
-        RelatonIsoBib::IsoBibliographicItem.new(
-          fetched: Date.today.to_s,
-          docid: fetch_relaton_docids(doc, hit.pubid),
-          docnumber: fetch_docnumber(hit.pubid),
-          edition: edition,
-          language: langs.map { |l| l[:lang] },
-          script: langs.map { |l| script(l[:lang]) }.uniq,
-          title: titles,
-          doctype: fetch_type(hit.hit[:title]),
-          docstatus: fetch_status(doc),
-          ics: fetch_ics(doc),
-          date: fetch_dates(doc, hit.hit[:title]),
-          contributor: fetch_contributors(hit.hit[:title]),
-          editorialgroup: fetch_workgroup(doc),
-          abstract: abstract,
-          copyright: fetch_copyright(doc),
-          link: fetch_link(doc, url),
-          relation: fetch_relations(doc),
-          place: ["Geneva"],
-          structuredidentifier: fetch_structuredidentifier(hit.pubid),
-        )
-      end
+    extend self
+    # Parse page.
+    # @param hit [RelatonIso::Hit]
+    # @param lang [String, NilClass]
+    # @return [RelatonIsoBib::IsoBibliographicItem]
+    def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
+      # path = "/contents/data/standard#{hit_data['splitPath']}/"\
+      # "#{hit_data['csnumber']}.html"
+      path = hit.hit[:path].sub("/sites/isoorg", "")
+      doc, url = get_page "#{path}.html"
+      # Fetch edition.
+      edition = doc.at("//div[div[.='Edition']]/text()[last()]")
+        &.text&.match(/\d+$/)&.to_s
+      hit.pubid.base.edition ||= edition if hit.pubid.base
+      titles, abstract, langs = fetch_titles_abstract(doc, lang)
+      RelatonIsoBib::IsoBibliographicItem.new(
+        fetched: Date.today.to_s,
+        docid: fetch_relaton_docids(doc, hit.pubid),
+        docnumber: fetch_docnumber(hit.pubid),
+        edition: edition,
+        language: langs.map { |l| l[:lang] },
+        script: langs.map { |l| script(l[:lang]) }.uniq,
+        title: titles,
+        doctype: fetch_type(hit.hit[:title]),
+        docstatus: fetch_status(doc),
+        ics: fetch_ics(doc),
+        date: fetch_dates(doc, hit.hit[:title]),
+        contributor: fetch_contributors(hit.hit[:title]),
+        editorialgroup: fetch_workgroup(doc),
+        abstract: abstract,
+        copyright: fetch_copyright(doc),
+        link: fetch_link(doc, url),
+        relation: fetch_relations(doc),
+        place: ["Geneva"],
+        structuredidentifier: fetch_structuredidentifier(hit.pubid),
+      )
+    end
-      #
-      # Create document ids.
-      #
-      # @param doc [Nokogiri::HTML::Document] document to parse
-      # @param pubid [Pubid::Iso::Identifier] publication identifier
-      #
-      # @return [Array<RelatonBib::DocumentIdentifier>]
-      #
-      def fetch_relaton_docids(doc, pubid)
-        pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
-        [
-          RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
-          RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
-          RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
-        ]
-      end
+    #
+    # Create document ids.
+    #
+    # @param doc [Nokogiri::HTML::Document] document to parse
+    # @param pubid [Pubid::Iso::Identifier] publication identifier
+    #
+    # @return [Array<RelatonBib::DocumentIdentifier>]
+    #
+    def fetch_relaton_docids(doc, pubid)
+      pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
+      [
+        RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
+        RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
+        RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
+      ]
+    end
-      #
-      # Create ISO reference identifier with English language.
-      #
-      # @param [Pubid::Iso::Identifier] pubid publication identifier
-      #
-      # @return [String] English reference identifier
-      #
-      def isoref(pubid)
-        params = pubid.get_params.reject { |k, _| k == :typed_stage }
-        Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
-      end
+    #
+    # Create ISO reference identifier with English language.
+    #
+    # @param [Pubid::Iso::Identifier] pubid publication identifier
+    #
+    # @return [String] English reference identifier
+    #
+    def isoref(pubid)
+      params = pubid.get_params.reject { |k, _| k == :typed_stage }
+      Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
+    end
-      private
-      # Fetch titles and abstracts.
-      # @param doc [Nokigiri::HTML::Document]
-      # @param lang [String, NilClass]
-      # @return [Array<Array>]
-      def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
-        titles   = RelatonBib::TypedTitleStringCollection.new
-        abstract = []
-        langs = languages(doc, lang).reduce([]) do |s, l|
-          # Don't need to get page for en. We already have it.
-          d = l[:path] ? get_page(l[:path])[0] : doc
-          unless d.at("//h5[@class='help-block']" \
-                      "[.='недоступно на русском языке']")
-            s << l
-            titles += fetch_title(d, l[:lang])
-            # Fetch abstracts.
-            abstract_content = d.xpath(
-              "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
-            ).map do |a|
-              a.name == "li" ? "- #{a.text}" : a.text
-            end.reject(&:empty?).join("\n")
-            unless abstract_content.empty?
-              abstract << {
-                content: abstract_content,
-                language: l[:lang],
-                script: script(l[:lang]),
-                format: "text/plain",
-              }
-            end
+    private
+    # Fetch titles and abstracts.
+    # @param doc [Nokigiri::HTML::Document]
+    # @param lang [String, NilClass]
+    # @return [Array<Array>]
+    def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
+      titles   = RelatonBib::TypedTitleStringCollection.new
+      abstract = []
+      langs = languages(doc, lang).reduce([]) do |s, l|
+        # Don't need to get page for en. We already have it.
+        d = l[:path] ? get_page(l[:path])[0] : doc
+        unless d.at("//h5[@class='help-block']" \
+                    "[.='недоступно на русском языке']")
+          s << l
+          titles += fetch_title(d, l[:lang])
+          # Fetch abstracts.
+          abstract_content = d.xpath(
+            "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
+          ).map do |a|
+            a.name == "li" ? "- #{a.text}" : a.text
+          end.reject(&:empty?).join("\n")
+          unless abstract_content.empty?
+            abstract << {
+              content: abstract_content,
+              language: l[:lang],
+              script: script(l[:lang]),
+              format: "text/plain",
+            }
           end
-          s
         end
-        [titles, abstract, langs]
+        s
       end
+      [titles, abstract, langs]
+    end
-      # Returns available languages.
-      # @param doc [Nokogiri::HTML::Document]
-      # @pqrqm lang [String, NilClass]
-      # @return [Array<Hash>]
-      def languages(doc, lang)
-        lgs = [{ lang: "en" }]
-        doc.css("li#lang-switcher ul li a").each do |lang_link|
-          lang_path = lang_link.attr("href")
-          l = lang_path.match(%r{^/(fr)/})
-          lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
-        end
-        lgs
+    # Returns available languages.
+    # @param doc [Nokogiri::HTML::Document]
+    # @pqrqm lang [String, NilClass]
+    # @return [Array<Hash>]
+    def languages(doc, lang)
+      lgs = [{ lang: "en" }]
+      doc.css("li#lang-switcher ul li a").each do |lang_link|
+        lang_path = lang_link.attr("href")
+        l = lang_path.match(%r{^/(fr)/})
+        lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
       end
+      lgs
+    end
-      # Get page.
-      # @param path [String] page's path
-      # @return [Array<Nokogiri::HTML::Document, String>]
-      def get_page(path)
-        resp, uri = get_redirection path
-        doc = try_if_fail resp, uri
-        [doc, uri.to_s]
-      rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
-             EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
-             Net::ProtocolError, Errno::ETIMEDOUT
-        raise RelatonBib::RequestError, "Could not access #{uri}"
-      end
+    # Get page.
+    # @param path [String] page's path
+    # @return [Array<Nokogiri::HTML::Document, String>]
+    def get_page(path)
+      resp, uri = get_redirection path
+      doc = try_if_fail resp, uri
+      [doc, uri.to_s]
+    rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
+            EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
+            Net::ProtocolError, Errno::ETIMEDOUT
+      raise RelatonBib::RequestError, "Could not access #{uri}"
+    end
-      #
-      # Get the page from the given path. If the page is redirected, get the
-      # page from the new path.
-      #
-      # @param [String] path path to the page
-      #
-      # @return [Array<Net::HTTPOK, URI>] HTTP response and URI
-      # @raise [RelatonBib::RequestError] if the page is not found
-      #
-      def get_redirection(path)
-        url = DOMAIN + path
-        uri = URI url
-        resp = Net::HTTP.get_response(uri)
-        raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
+    #
+    # Get the page from the given path. If the page is redirected, get the
+    # page from the new path.
+    #
+    # @param [String] path path to the page
+    #
+    # @return [Array<Net::HTTPOK, URI>] HTTP response and URI
+    # @raise [RelatonBib::RequestError] if the page is not found
+    #
+    def get_redirection(path)
+      url = DOMAIN + path
+      uri = URI url
+      resp = Net::HTTP.get_response(uri)
+      raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
+      resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
+    end
-        resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
-      end
+    #
+    # The iso.org site fails to respond sometimes. This method tries to get
+    # the response again.
+    #
+    # @param [Net::HTTPOK] resp HTTP response
+    # @param [URI::HTTPS] uri URI of the page
+    #
+    # @return [Nokogiri::HTML4::Document] document
+    # @raise [RelatonBib::RequestError] if the page could not be parsed
+    #
+    def try_if_fail(resp, uri)
+      10.times do
+        doc = Nokogiri::HTML(resp.body)
+        # stop trying if page has a document id
+        return doc if item_ref doc
-      #
-      # The iso.org site fails to respond sometimes. This method tries to get
-      # the response again.
-      #
-      # @param [Net::HTTPOK] resp HTTP response
-      # @param [URI::HTTPS] uri URI of the page
-      #
-      # @return [Nokogiri::HTML4::Document] document
-      # @raise [RelatonBib::RequestError] if the page could not be parsed
-      #
-      def try_if_fail(resp, uri)
-        10.times do
-          doc = Nokogiri::HTML(resp.body)
-          # stop trying if page has a document id
-          return doc if item_ref doc
-          resp = Net::HTTP.get_response(uri)
-        end
-        raise RelatonBib::RequestError, "Could not parse the page #{uri}"
+        resp = Net::HTTP.get_response(uri)
       end
+      raise RelatonBib::RequestError, "Could not parse the page #{uri}"
+    end
-      #
-      # Generate docnumber.
-      #
-      # @param [Pubid::Iso] pubid
-      #
-      # @return [String] docnumber
-      #
-      def fetch_docnumber(pubid)
-        pubid.to_s.match(/\d+/)&.to_s
-      end
+    #
+    # Generate docnumber.
+    #
+    # @param [Pubid::Iso] pubid
+    #
+    # @return [String] docnumber
+    #
+    def fetch_docnumber(pubid)
+      pubid.to_s.match(/\d+/)&.to_s
+    end
-      #
-      # Parse structuredidentifier.
-      #
-      # @param pubid [Pubid::Iso::Identifier] pubid
-      #
-      # @return [RelatonBib::StructuredIdentifier] structured identifier
-      #
-      def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
-        RelatonIsoBib::StructuredIdentifier.new(
-          project_number: "#{pubid.publisher} #{pubid.number}",
-          part: pubid.part&.to_s, # &.sub(/^-/, ""),
-          type: pubid.publisher,
-        )
-      end
+    #
+    # Parse structuredidentifier.
+    #
+    # @param pubid [Pubid::Iso::Identifier] pubid
+    #
+    # @return [RelatonBib::StructuredIdentifier] structured identifier
+    #
+    def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
+      RelatonIsoBib::StructuredIdentifier.new(
+        project_number: "#{pubid.publisher} #{pubid.number}",
+        part: pubid.part&.to_s, # &.sub(/^-/, ""),
+        type: pubid.publisher,
+      )
+    end
-      def item_ref(doc)
-        doc.at("//main//section/div/div/div//h1")&.text
-      end
+    def item_ref(doc)
+      doc.at("//main//section/div/div/div//h1")&.text
+    end
-      # Fetch status.
-      # @param doc [Nokogiri::HTML::Document]
-      # @param status [String]
-      # @return [Hash]
-      def fetch_status(doc)
-        stg, substg = stage_code(doc).split "."
-        RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
-      end
+    # Fetch status.
+    # @param doc [Nokogiri::HTML::Document]
+    # @param status [String]
+    # @return [Hash]
+    def fetch_status(doc)
+      stg, substg = stage_code(doc).split "."
+      RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
+    end
-      def stage_code(doc)
-        doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
-               "/a/span[@class='stage-code']").text
-      end
+    def stage_code(doc)
+      doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
+              "/a/span[@class='stage-code']").text
+    end
-      # def stage(stg, substg)
-      #   abbr = STGABBR[stg].is_a?(Hash) ? STGABBR[stg][substg] : STGABBR[stg]
-      #   RelatonBib::DocumentStatus::Stage.new value: stg, abbreviation: abbr
-      # end
-      # Fetch workgroup.
-      # @param doc [Nokogiri::HTML::Document]
-      # @return [Hash]
-      def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
-        wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
-        return unless wg
-        workgroup = wg.text.split "/"
-        type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
-        # {
-        #   name: "International Organization for Standardization",
-        #   abbreviation: "ISO",
-        #   url: "www.iso.org",
-        # }
-        tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
-        tc_name = wg[:title]
-        tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
-                                       type: type, number: tc_numb)
-        RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
-      end
+    # def stage(stg, substg)
+    #   abbr = STGABBR[stg].is_a?(Hash) ? STGABBR[stg][substg] : STGABBR[stg]
+    #   RelatonBib::DocumentStatus::Stage.new value: stg, abbreviation: abbr
+    # end
+    # Fetch workgroup.
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Hash]
+    def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
+      wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
+      return unless wg
+      workgroup = wg.text.split "/"
+      type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
+      # {
+      #   name: "International Organization for Standardization",
+      #   abbreviation: "ISO",
+      #   url: "www.iso.org",
+      # }
+      tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
+      tc_name = wg[:title]
+      tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
+                                      type: type, number: tc_numb)
+      RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
+    end
-      # Fetch relations.
-      # @param doc [Nokogiri::HTML::Document]
-      # @return [Array<Hash>]
-      def fetch_relations(doc)
-        types = ["Now", "Now under review"]
-        doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
-          type, date = relation_type(r.at("h4", "h5").text.strip, doc)
-          next a if types.include?(type)
+    # Fetch relations.
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Array<Hash>]
+    def fetch_relations(doc)
+      types = ["Now", "Now under review"]
+      doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
+        type, date = relation_type(r.at("h4", "h5").text.strip, doc)
+        next a if types.include?(type)
-          a + create_relations(r, type, date)
-        end
+        a + create_relations(r, type, date)
       end
+    end
-      def relation_type(type, doc)
-        date = []
-        t = case type.strip
-            when "Previously", "Will be replaced by" then "obsoletes"
-            when "Corrigenda / Amendments", "Revised by", "Now confirmed"
-              on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
-              date << { type: "circulated", on: on.text } if on
-              "updates"
-            else type
-            end
-        [t, date]
-      end
+    def relation_type(type, doc)
+      date = []
+      t = case type.strip
+          when "Previously", "Will be replaced by" then "obsoletes"
+          when "Corrigenda / Amendments", "Revised by", "Now confirmed"
+            on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
+            date << { type: "circulated", on: on.text } if on
+            "updates"
+          else type
+          end
+      [t, date]
+    end
-      def create_relations(rel, type, date)
-        rel.css("a").map do |id|
-          docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
-          fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
-          bibitem = RelatonIsoBib::IsoBibliographicItem.new(
-            docid: [docid], formattedref: fref, date: date,
-          )
-          { type: type, bibitem: bibitem }
-        end
+    def create_relations(rel, type, date)
+      rel.css("a").map do |id|
+        docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
+        fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
+        bibitem = RelatonIsoBib::IsoBibliographicItem.new(
+          docid: [docid], formattedref: fref, date: date,
+        )
+        { type: type, bibitem: bibitem }
       end
+    end
-      # Fetch type.
-      # @param ref [String]
-      # @return [String]
-      def fetch_type(ref)
-        %r{
-          ^(?<prefix>ISO|IWA|IEC)
-          (?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
-          (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
-        }x =~ ref
-        # return "international-standard" if type_match.nil?
-        if TYPES[type] then TYPES[type]
-        elsif prefix == "ISO" then "international-standard"
-        elsif prefix == "IWA" then "international-workshop-agreement"
-        end
-        # rescue => _e
-        #   puts 'Unknown document type: ' + title
+    # Fetch type.
+    # @param ref [String]
+    # @return [String]
+    def fetch_type(ref)
+      %r{
+        ^(?<prefix>ISO|IWA|IEC)
+        (?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
+        (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
+      }x =~ ref
+      # return "international-standard" if type_match.nil?
+      if TYPES[type] then TYPES[type]
+      elsif prefix == "ISO" then "international-standard"
+      elsif prefix == "IWA" then "international-workshop-agreement"
       end
+      # rescue => _e
+      #   puts 'Unknown document type: ' + title
+    end
-      # Fetch titles.
-      # @param doc [Nokogiri::HTML::Document]
-      # @param lang [String]
-      # @return [Array<RelatonBib::TypedTitleString>]
-      def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
-        head = doc.at "//nav[contains(@class,'heading-condensed')]"
-        types = { "h2" => "title-intro", "h3" => "title-main", "h4" => "title-part" }
-        title_types = head.xpath("h2 | h3 | h4").each_with_object({}) do |t, h|
-          h[types[t.name]] = t.text
-        end
-        title = RelatonBib::TypedTitleStringCollection.new
-        title_types.each do |type, content|
-          title << RelatonBib::TypedTitleString.new(
-            type: type, content: content, language: lang, script: script(lang),
-          )
-        end
-        main = title.map { |t| t.title.content }.join " - "
-        title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
+    # Fetch titles.
+    # @param doc [Nokogiri::HTML::Document]
+    # @param lang [String]
+    # @return [Array<RelatonBib::TypedTitleString>]
+    def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
+      types = %w[title-intro title-main title-part]
+      ttls = titles(doc)
+      title = RelatonBib::TypedTitleStringCollection.new
+      ttls.each.with_index do |p, i|
+        next unless p
+        title << RelatonBib::TypedTitleString.new(
+          type: types[i], content: p, language: lang, script: script(lang),
+        )
+      end.compact
+      main = title.map { |t| t.title.content }.join " - "
+      title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
+    end
+    def titles(doc)
+      head = doc.at "//nav[contains(@class,'heading-condensed')]"
+      ttls = head.xpath("h2 | h3 | h4").map &:text
+      ttls = ttls[0].split " - " if ttls.size == 1
+      case ttls.size
+      when 0, 1 then [nil, ttls.first, nil]
+      else RelatonBib::TypedTitleString.intro_or_part ttls
       end
+    end
-      # Return ISO script code.
-      # @param lang [String]
-      # @return [String]
-      def script(lang)
-        case lang
-        when "en", "fr" then "Latn"
-          # when "ru" then "Cyrl"
-        end
+    # Return ISO script code.
+    # @param lang [String]
+    # @return [String]
+    def script(lang)
+      case lang
+      when "en", "fr" then "Latn"
+        # when "ru" then "Cyrl"
       end
+    end
-      # Fetch dates
-      # @param doc [Nokogiri::HTML::Document]
-      # @param ref [String]
-      # @return [Array<Hash>]
-      def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
-        dates = []
-        %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
-        pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
-        if ref_date_str
-          ref_date = Date.strptime ref_date_str, "%Y"
-          if pub_date_str.empty?
+    # Fetch dates
+    # @param doc [Nokogiri::HTML::Document]
+    # @param ref [String]
+    # @return [Array<Hash>]
+    def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
+      dates = []
+      %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
+      pub_date_str = doc.at("//span[@itemprop='releaseDate']")
+      if ref_date_str
+        ref_date = Date.strptime ref_date_str, "%Y"
+        if pub_date_str.nil?
+          dates << { type: "published", on: ref_date_str }
+        else
+          pub_date = Date.strptime pub_date_str.text, "%Y"
+          if pub_date.year > ref_date.year
             dates << { type: "published", on: ref_date_str }
+            dates << { type: "updated", on: pub_date_str.text }
           else
-            pub_date = Date.strptime pub_date_str, "%Y"
-            if pub_date.year > ref_date.year
-              dates << { type: "published", on: ref_date_str }
-              dates << { type: "updated", on: pub_date_str }
-            else
-              dates << { type: "published", on: pub_date_str }
-            end
+            dates << { type: "published", on: pub_date_str.text }
           end
-        elsif !pub_date_str.empty?
-          dates << { type: "published", on: pub_date_str }
         end
-        dates
+      elsif pub_date_str
+        dates << { type: "published", on: pub_date_str.text }
       end
+      corr_data = doc.at "//span[@itemprop='dateModified']"
+      dates << { type: "corrected", on: corr_data.text } if corr_data
+      dates
+    end
-      def fetch_contributors(ref)
-        ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
-          publisher = PUBLISHERS[abbrev]
-          next mem unless publisher
+    def fetch_contributors(ref)
+      ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
+        publisher = PUBLISHERS[abbrev]
+        next mem unless publisher
-          publisher[:abbreviation] = abbrev
-          mem << { entity: publisher, role: [type: "publisher"] }
-        end
+        publisher[:abbreviation] = abbrev
+        mem << { entity: publisher, role: [type: "publisher"] }
       end
+    end
-      # Fetch ICS.
-      # @param doc [Nokogiri::HTML::Document]
-      # @return [Array<Hash>]
-      def fetch_ics(doc)
-        doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
-          code = i.text.match(/[\d.]+/).to_s.split "."
-          { field: code[0], group: code[1], subgroup: code[2] }
-        end
+    # Fetch ICS.
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Array<Hash>]
+    def fetch_ics(doc)
+      doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
+        code = i.text.match(/[\d.]+/).to_s.split "."
+        { field: code[0], group: code[1], subgroup: code[2] }
       end
+    end
-      #
-      # Fetch links.
-      #
-      # @param doc [Nokogiri::HTML::Document] document to parse
-      # @param url [String] document url
-      #
-      # @return [Array<Hash>]
-      #
-      def fetch_link(doc, url)
-        links = [{ type: "src", content: url }]
-        obp = doc.at("//h4[contains(@class, 'h5')]/a")
-        links << { type: "obp", content: obp[:href] } if obp
-        rss = doc.at("//a[contains(@href, 'rss')]")
-        links << { type: "rss", content: DOMAIN + rss[:href] } if rss
-        pub = doc.at "//p[contains(., 'publicly available')]/a",
-                     "//p[contains(., 'can be downloaded from the')]/a"
-        links << { type: "pub", content: pub[:href] } if pub
-        links
-      end
+    #
+    # Fetch links.
+    #
+    # @param doc [Nokogiri::HTML::Document] document to parse
+    # @param url [String] document url
+    #
+    # @return [Array<Hash>]
+    #
+    def fetch_link(doc, url)
+      links = [{ type: "src", content: url }]
+      obp = doc.at("//h4[contains(@class, 'h5')]/a")
+      links << { type: "obp", content: obp[:href] } if obp
+      rss = doc.at("//a[contains(@href, 'rss')]")
+      links << { type: "rss", content: DOMAIN + rss[:href] } if rss
+      pub = doc.at "//p[contains(., 'publicly available')]/a",
+                    "//p[contains(., 'can be downloaded from the')]/a"
+      links << { type: "pub", content: pub[:href] } if pub
+      links
+    end
-      # Fetch copyright.
-      # @param doc [Nokogiri::HTML::Document]
-      # @return [Array<Hash>]
-      def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
-        ref = item_ref doc
-        owner_name = ref.match(/.*?(?=\s)/).to_s
-        from = ref.match(/(?<=:)\d{4}/).to_s
-        if from.empty?
-          date = doc.at(
-            "//span[@itemprop='releaseDate']",
-            "//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
-          )
-          from = date.text.match(/\d{4}/).to_s
-        end
-        [{ owner: [{ name: owner_name }], from: from }]
+    # Fetch copyright.
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Array<Hash>]
+    def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
+      ref = item_ref doc
+      owner_name = ref.match(/.*?(?=\s)/).to_s
+      from = ref.match(/(?<=:)\d{4}/).to_s
+      if from.empty?
+        date = doc.at(
+          "//span[@itemprop='releaseDate']",
+          "//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
+        )
+        from = date.text.match(/\d{4}/).to_s
       end
+      [{ owner: [{ name: owner_name }], from: from }]
     end
   end
 end

data/lib/relaton_iso/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module RelatonIso
-  VERSION = "1.16.2"
+  VERSION = "1.16.4"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: relaton-iso
 version: !ruby/object:Gem::Version
-  version: 1.16.2
+  version: 1.16.4
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-10-20 00:00:00.000000000 Z
+date: 2023-11-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: algolia