RubyGems - relaton-iso - Versions diffs - 1.16.1 → 1.16.3 - Mend

relaton-iso 1.16.1 → 1.16.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

checksums.yaml +4 -4
data/lib/relaton_iso/processor.rb +1 -1
data/lib/relaton_iso/scrapper.rb +380 -360
data/lib/relaton_iso/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 45b4a081a62ab5a5f0a4e6f2c2cffb4950861f09e838401a68aa2208731d65ec
-  data.tar.gz: 01521bd3e1fa7853145a390461390b7a07dfc20e1efb02c2d6d90372d03a8664
+  metadata.gz: 13ecc04a430b1dbf256c0853f612969727c16eba72a06cb2bc74bed17745ba90
+  data.tar.gz: f795f63a994b843e07d4857ba3b0dd9c91ec9a3ccb408827f1bf7bdbf5f854a9
 SHA512:
-  metadata.gz: 0e72371e46e2d03875fce213861ab9f087fdafca4abe748436f8ccc217ee2d82b5a089c20d73062d2a022366b79294ac7b1a16f0b3f59593f79d673800286877
-  data.tar.gz: c6fa8308f8feb86cc08ae3a1fde9e267169c5bf8b5292a6c97f4dbf7f28668b56ac111e3dc03411dfa537ce83905e87a273d866b275f376cef402a3f641a59b6
+  metadata.gz: d33586bbe409f54736b694d774a52e1bef8a4cc2d7c304aebd06c5ead8b3893b6f45c65d3e5c586c5e7f9f23501b52ae6b0630c25213d6105660251d03cff94e
+  data.tar.gz: e7fdcb33dfa855c73ead77a514eae36d761274bafeec77c520ac8ff84a05c6a04a2b30bd80fa7d89fcabda1975eeb95f85c65d0c177da7e1694da97bc4245ccd

data/lib/relaton_iso/processor.rb CHANGED

@@ -4,7 +4,7 @@ module RelatonIso
   class Processor < Relaton::Processor
     attr_reader :idtype
-    def initialize
+    def initialize # rubocop:disable Lint/MissingSuper
       @short = :relaton_iso
       @prefix = "ISO"
       @defaultprefix = %r{^ISO(/IEC)?\s}

data/lib/relaton_iso/scrapper.rb CHANGED

@@ -43,407 +43,427 @@ module RelatonIso
                   url: "www.asme.org" },
     }.freeze
-    class << self
-      # Parse page.
-      # @param hit [RelatonIso::Hit]
-      # @param lang [String, NilClass]
-      # @return [RelatonIsoBib::IsoBibliographicItem]
-      def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
-        # path = "/contents/data/standard#{hit_data['splitPath']}/"\
-        # "#{hit_data['csnumber']}.html"
-        path = hit.hit[:path].sub("/sites/isoorg", "")
-        doc, url = get_page "#{path}.html"
-        # Fetch edition.
-        edition = doc.at("//div[div[.='Edition']]/text()[last()]")
-          &.text&.match(/\d+$/)&.to_s
-        hit.pubid.base.edition ||= edition if hit.pubid.base
-        titles, abstract, langs = fetch_titles_abstract(doc, lang)
-        RelatonIsoBib::IsoBibliographicItem.new(
-          fetched: Date.today.to_s,
-          docid: fetch_relaton_docids(doc, hit.pubid),
-          docnumber: fetch_docnumber(hit.pubid),
-          edition: edition,
-          language: langs.map { |l| l[:lang] },
-          script: langs.map { |l| script(l[:lang]) }.uniq,
-          title: titles,
-          doctype: fetch_type(hit.hit[:title]),
-          docstatus: fetch_status(doc),
-          ics: fetch_ics(doc),
-          date: fetch_dates(doc, hit.hit[:title]),
-          contributor: fetch_contributors(hit.hit[:title]),
-          editorialgroup: fetch_workgroup(doc),
-          abstract: abstract,
-          copyright: fetch_copyright(doc),
-          link: fetch_link(doc, url),
-          relation: fetch_relations(doc),
-          place: ["Geneva"],
-          structuredidentifier: fetch_structuredidentifier(hit.pubid),
-        )
-      end
+    extend self
+    # Parse page.
+    # @param hit [RelatonIso::Hit]
+    # @param lang [String, NilClass]
+    # @return [RelatonIsoBib::IsoBibliographicItem]
+    def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
+      # path = "/contents/data/standard#{hit_data['splitPath']}/"\
+      # "#{hit_data['csnumber']}.html"
+      path = hit.hit[:path].sub("/sites/isoorg", "")
+      doc, url = get_page "#{path}.html"
+      # Fetch edition.
+      edition = doc.at("//div[div[.='Edition']]/text()[last()]")
+        &.text&.match(/\d+$/)&.to_s
+      hit.pubid.base.edition ||= edition if hit.pubid.base
+      titles, abstract, langs = fetch_titles_abstract(doc, lang)
+      RelatonIsoBib::IsoBibliographicItem.new(
+        fetched: Date.today.to_s,
+        docid: fetch_relaton_docids(doc, hit.pubid),
+        docnumber: fetch_docnumber(hit.pubid),
+        edition: edition,
+        language: langs.map { |l| l[:lang] },
+        script: langs.map { |l| script(l[:lang]) }.uniq,
+        title: titles,
+        doctype: fetch_type(hit.hit[:title]),
+        docstatus: fetch_status(doc),
+        ics: fetch_ics(doc),
+        date: fetch_dates(doc, hit.hit[:title]),
+        contributor: fetch_contributors(hit.hit[:title]),
+        editorialgroup: fetch_workgroup(doc),
+        abstract: abstract,
+        copyright: fetch_copyright(doc),
+        link: fetch_link(doc, url),
+        relation: fetch_relations(doc),
+        place: ["Geneva"],
+        structuredidentifier: fetch_structuredidentifier(hit.pubid),
+      )
+    end
-      #
-      # Create document ids.
-      #
-      # @param doc [Nokogiri::HTML::Document] document to parse
-      # @param pubid [Pubid::Iso::Identifier] publication identifier
-      #
-      # @return [Array<RelatonBib::DocumentIdentifier>]
-      #
-      def fetch_relaton_docids(doc, pubid)
-        pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
-        [
-          RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
-          RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
-          RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
-        ]
-      end
+    #
+    # Create document ids.
+    #
+    # @param doc [Nokogiri::HTML::Document] document to parse
+    # @param pubid [Pubid::Iso::Identifier] publication identifier
+    #
+    # @return [Array<RelatonBib::DocumentIdentifier>]
+    #
+    def fetch_relaton_docids(doc, pubid)
+      pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
+      [
+        RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
+        RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
+        RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
+      ]
+    end
-      #
-      # Create ISO reference identifier with English language.
-      #
-      # @param [Pubid::Iso::Identifier] pubid publication identifier
-      #
-      # @return [String] English reference identifier
-      #
-      def isoref(pubid)
-        params = pubid.get_params.reject { |k, _| k == :typed_stage }
-        Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
-      end
+    #
+    # Create ISO reference identifier with English language.
+    #
+    # @param [Pubid::Iso::Identifier] pubid publication identifier
+    #
+    # @return [String] English reference identifier
+    #
+    def isoref(pubid)
+      params = pubid.get_params.reject { |k, _| k == :typed_stage }
+      Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
+    end
-      private
-      # Fetch titles and abstracts.
-      # @param doc [Nokigiri::HTML::Document]
-      # @param lang [String, NilClass]
-      # @return [Array<Array>]
-      def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
-        titles   = RelatonBib::TypedTitleStringCollection.new
-        abstract = []
-        langs = languages(doc, lang).reduce([]) do |s, l|
-          # Don't need to get page for en. We already have it.
-          d = l[:path] ? get_page(l[:path])[0] : doc
-          unless d.at("//h5[@class='help-block']" \
-                      "[.='недоступно на русском языке']")
-            s << l
-            titles += fetch_title(d, l[:lang])
-            # Fetch abstracts.
-            abstract_content = d.xpath(
-              "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
-            ).map do |a|
-              a.name == "li" ? "- #{a.text}" : a.text
-            end.reject(&:empty?).join("\n")
-            unless abstract_content.empty?
-              abstract << {
-                content: abstract_content,
-                language: l[:lang],
-                script: script(l[:lang]),
-                format: "text/plain",
-              }
-            end
+    private
+    # Fetch titles and abstracts.
+    # @param doc [Nokigiri::HTML::Document]
+    # @param lang [String, NilClass]
+    # @return [Array<Array>]
+    def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
+      titles   = RelatonBib::TypedTitleStringCollection.new
+      abstract = []
+      langs = languages(doc, lang).reduce([]) do |s, l|
+        # Don't need to get page for en. We already have it.
+        d = l[:path] ? get_page(l[:path])[0] : doc
+        unless d.at("//h5[@class='help-block']" \
+                    "[.='недоступно на русском языке']")
+          s << l
+          titles += fetch_title(d, l[:lang])
+          # Fetch abstracts.
+          abstract_content = d.xpath(
+            "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
+          ).map do |a|
+            a.name == "li" ? "- #{a.text}" : a.text
+          end.reject(&:empty?).join("\n")
+          unless abstract_content.empty?
+            abstract << {
+              content: abstract_content,
+              language: l[:lang],
+              script: script(l[:lang]),
+              format: "text/plain",
+            }
           end
-          s
         end
-        [titles, abstract, langs]
+        s
       end
+      [titles, abstract, langs]
+    end
-      # Returns available languages.
-      # @param doc [Nokogiri::HTML::Document]
-      # @pqrqm lang [String, NilClass]
-      # @return [Array<Hash>]
-      def languages(doc, lang)
-        lgs = [{ lang: "en" }]
-        doc.css("li#lang-switcher ul li a").each do |lang_link|
-          lang_path = lang_link.attr("href")
-          l = lang_path.match(%r{^/(fr)/})
-          lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
-        end
-        lgs
+    # Returns available languages.
+    # @param doc [Nokogiri::HTML::Document]
+    # @pqrqm lang [String, NilClass]
+    # @return [Array<Hash>]
+    def languages(doc, lang)
+      lgs = [{ lang: "en" }]
+      doc.css("li#lang-switcher ul li a").each do |lang_link|
+        lang_path = lang_link.attr("href")
+        l = lang_path.match(%r{^/(fr)/})
+        lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
       end
+      lgs
+    end
-      # Get page.
-      # @param path [String] page's path
-      # @return [Array<Nokogiri::HTML::Document, String>]
-      def get_page(path)
-        resp, uri = get_redirection path
-        doc = try_if_fail resp, uri
-        [doc, uri.to_s]
-      rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
-             EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
-             Net::ProtocolError, Errno::ETIMEDOUT
-        raise RelatonBib::RequestError, "Could not access #{uri}"
-      end
+    # Get page.
+    # @param path [String] page's path
+    # @return [Array<Nokogiri::HTML::Document, String>]
+    def get_page(path)
+      resp, uri = get_redirection path
+      doc = try_if_fail resp, uri
+      [doc, uri.to_s]
+    rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
+            EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
+            Net::ProtocolError, Errno::ETIMEDOUT
+      raise RelatonBib::RequestError, "Could not access #{uri}"
+    end
-      #
-      # Get the page from the given path. If the page is redirected, get the
-      # page from the new path.
-      #
-      # @param [String] path path to the page
-      #
-      # @return [Array<Net::HTTPOK, URI>] HTTP response and URI
-      # @raise [RelatonBib::RequestError] if the page is not found
-      #
-      def get_redirection(path)
-        url = DOMAIN + path
-        uri = URI url
-        resp = Net::HTTP.get_response(uri)
-        raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
+    #
+    # Get the page from the given path. If the page is redirected, get the
+    # page from the new path.
+    #
+    # @param [String] path path to the page
+    #
+    # @return [Array<Net::HTTPOK, URI>] HTTP response and URI
+    # @raise [RelatonBib::RequestError] if the page is not found
+    #
+    def get_redirection(path)
+      url = DOMAIN + path
+      uri = URI url
+      resp = Net::HTTP.get_response(uri)
+      raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
+      resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
+    end
-        resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
-      end
+    #
+    # The iso.org site fails to respond sometimes. This method tries to get
+    # the response again.
+    #
+    # @param [Net::HTTPOK] resp HTTP response
+    # @param [URI::HTTPS] uri URI of the page
+    #
+    # @return [Nokogiri::HTML4::Document] document
+    # @raise [RelatonBib::RequestError] if the page could not be parsed
+    #
+    def try_if_fail(resp, uri)
+      10.times do
+        doc = Nokogiri::HTML(resp.body)
+        # stop trying if page has a document id
+        return doc if item_ref doc
-      #
-      # The iso.org site fails to respond sometimes. This method tries to get
-      # the response again.
-      #
-      # @param [Net::HTTPOK] resp HTTP response
-      # @param [URI::HTTPS] uri URI of the page
-      #
-      # @return [Nokogiri::HTML4::Document] document
-      # @raise [RelatonBib::RequestError] if the page could not be parsed
-      #
-      def try_if_fail(resp, uri)
-        10.times do
-          doc = Nokogiri::HTML(resp.body)
-          # stop trying if page has a document id
-          return doc if item_ref doc
-          resp = Net::HTTP.get_response(uri)
-        end
-        raise RelatonBib::RequestError, "Could not parse the page #{uri}"
+        resp = Net::HTTP.get_response(uri)
       end
+      raise RelatonBib::RequestError, "Could not parse the page #{uri}"
+    end
-      #
-      # Generate docnumber.
-      #
-      # @param [Pubid::Iso] pubid
-      #
-      # @return [String] docnumber
-      #
-      def fetch_docnumber(pubid)
-        pubid.to_s.match(/\d+/)&.to_s
-      end
+    #
+    # Generate docnumber.
+    #
+    # @param [Pubid::Iso] pubid
+    #
+    # @return [String] docnumber
+    #
+    def fetch_docnumber(pubid)
+      pubid.to_s.match(/\d+/)&.to_s
+    end
-      #
-      # Parse structuredidentifier.
-      #
-      # @param pubid [Pubid::Iso::Identifier] pubid
-      #
-      # @return [RelatonBib::StructuredIdentifier] structured identifier
-      #
-      def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
-        RelatonIsoBib::StructuredIdentifier.new(
-          project_number: "#{pubid.publisher} #{pubid.number}",
-          part: pubid.part&.to_s, # &.sub(/^-/, ""),
-          type: pubid.publisher,
-        )
-      end
+    #
+    # Parse structuredidentifier.
+    #
+    # @param pubid [Pubid::Iso::Identifier] pubid
+    #
+    # @return [RelatonBib::StructuredIdentifier] structured identifier
+    #
+    def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
+      RelatonIsoBib::StructuredIdentifier.new(
+        project_number: "#{pubid.publisher} #{pubid.number}",
+        part: pubid.part&.to_s, # &.sub(/^-/, ""),
+        type: pubid.publisher,
+      )
+    end
-      def item_ref(doc)
-        doc.at("//main//section/div/div/div//h1")&.text
-      end
+    def item_ref(doc)
+      doc.at("//main//section/div/div/div//h1")&.text
+    end
-      # Fetch status.
-      # @param doc [Nokogiri::HTML::Document]
-      # @param status [String]
-      # @return [Hash]
-      def fetch_status(doc)
-        stg, substg = stage_code(doc).split "."
-        RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
-      end
+    # Fetch status.
+    # @param doc [Nokogiri::HTML::Document]
+    # @param status [String]
+    # @return [Hash]
+    def fetch_status(doc)
+      stg, substg = stage_code(doc).split "."
+      RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
+    end
-      def stage_code(doc)
-        doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
-               "/a/span[@class='stage-code']").text
-      end
+    def stage_code(doc)
+      doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
+              "/a/span[@class='stage-code']").text
+    end
-      # def stage(stg, substg)
-      #   abbr = STGABBR[stg].is_a?(Hash) ? STGABBR[stg][substg] : STGABBR[stg]
-      #   RelatonBib::DocumentStatus::Stage.new value: stg, abbreviation: abbr
-      # end
-      # Fetch workgroup.
-      # @param doc [Nokogiri::HTML::Document]
-      # @return [Hash]
-      def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
-        wg = doc.at("//div[@class='clearfix']")
-        wg_link = wg.at "span/a"
-        return unless wg_link
-        workgroup = wg_link.text.split "/"
-        type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
-        # {
-        #   name: "International Organization for Standardization",
-        #   abbreviation: "ISO",
-        #   url: "www.iso.org",
-        # }
-        tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
-        tc_name = wg.at("span[@class='entry-title']").text
-        tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg_link.text,
-                                       type: type, number: tc_numb)
-        RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
+    # def stage(stg, substg)
+    #   abbr = STGABBR[stg].is_a?(Hash) ? STGABBR[stg][substg] : STGABBR[stg]
+    #   RelatonBib::DocumentStatus::Stage.new value: stg, abbreviation: abbr
+    # end
+    # Fetch workgroup.
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Hash]
+    def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
+      wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
+      return unless wg
+      workgroup = wg.text.split "/"
+      type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
+      # {
+      #   name: "International Organization for Standardization",
+      #   abbreviation: "ISO",
+      #   url: "www.iso.org",
+      # }
+      tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
+      tc_name = wg[:title]
+      tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
+                                      type: type, number: tc_numb)
+      RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
+    end
+    # Fetch relations.
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Array<Hash>]
+    def fetch_relations(doc)
+      types = ["Now", "Now under review"]
+      doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
+        type, date = relation_type(r.at("h4", "h5").text.strip, doc)
+        next a if types.include?(type)
+        a + create_relations(r, type, date)
       end
+    end
-      # rubocop:disable Metrics/MethodLength
-      # Fetch relations.
-      # @param doc [Nokogiri::HTML::Document]
-      # @return [Array<Hash>]
-      def fetch_relations(doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity
-        types = ["Now", "Now under review"]
-        doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
-          r_type = r.at("h4", "h5").text
-          date = []
-          type = case r_type.strip
-                 when "Previously", "Will be replaced by" then "obsoletes"
-                 when "Corrigenda / Amendments", "Revised by", "Now confirmed"
-                   on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
-                   date << { type: "circulated", on: on.text } if on
-                   "updates"
-                 else r_type
-                 end
-          if types.include?(type) then a
-          else
-            a + r.css("a").map do |id|
-              docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
-              fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
-              bibitem = RelatonIsoBib::IsoBibliographicItem.new(
-                docid: [docid], formattedref: fref, date: date,
-              )
-              { type: type, bibitem: bibitem }
-            end
+    def relation_type(type, doc)
+      date = []
+      t = case type.strip
+          when "Previously", "Will be replaced by" then "obsoletes"
+          when "Corrigenda / Amendments", "Revised by", "Now confirmed"
+            on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
+            date << { type: "circulated", on: on.text } if on
+            "updates"
+          else type
           end
-        end
+      [t, date]
+    end
+    def create_relations(rel, type, date)
+      rel.css("a").map do |id|
+        docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
+        fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
+        bibitem = RelatonIsoBib::IsoBibliographicItem.new(
+          docid: [docid], formattedref: fref, date: date,
+        )
+        { type: type, bibitem: bibitem }
       end
-      # rubocop:enable Metrics/MethodLength
-      # Fetch type.
-      # @param ref [String]
-      # @return [String]
-      def fetch_type(ref)
-        %r{
-          ^(?<prefix>ISO|IWA|IEC)
-          (?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
-          (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
-        }x =~ ref
-        # return "international-standard" if type_match.nil?
-        if TYPES[type] then TYPES[type]
-        elsif prefix == "ISO" then "international-standard"
-        elsif prefix == "IWA" then "international-workshop-agreement"
-        end
-        # rescue => _e
-        #   puts 'Unknown document type: ' + title
+    end
+    # Fetch type.
+    # @param ref [String]
+    # @return [String]
+    def fetch_type(ref)
+      %r{
+        ^(?<prefix>ISO|IWA|IEC)
+        (?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
+        (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
+      }x =~ ref
+      # return "international-standard" if type_match.nil?
+      if TYPES[type] then TYPES[type]
+      elsif prefix == "ISO" then "international-standard"
+      elsif prefix == "IWA" then "international-workshop-agreement"
       end
+      # rescue => _e
+      #   puts 'Unknown document type: ' + title
+    end
-      # Fetch titles.
-      # @param doc [Nokogiri::HTML::Document]
-      # @param lang [String]
-      # @return [Array<RelatonBib::TypedTitleString>]
-      def fetch_title(doc, lang)
-        content = doc.at(
-          "//nav[contains(@class,'heading-condensed')]/h2 | "\
-          "//nav[contains(@class,'heading-condensed')]/h3",
-        )&.text&.gsub(/\u2014/, "-")
-        return RelatonBib::TypedTitleStringCollection.new unless content
-        RelatonBib::TypedTitleString.from_string content, lang, script(lang)
+    # Fetch titles.
+    # @param doc [Nokogiri::HTML::Document]
+    # @param lang [String]
+    # @return [Array<RelatonBib::TypedTitleString>]
+    def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
+      types = %w[title-intro title-main title-part]
+      ttls = titles(doc)
+      title = RelatonBib::TypedTitleStringCollection.new
+      ttls.each.with_index do |p, i|
+        next unless p
+        title << RelatonBib::TypedTitleString.new(
+          type: types[i], content: p, language: lang, script: script(lang),
+        )
+      end.compact
+      main = title.map { |t| t.title.content }.join " - "
+      title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
+    end
+    def titles(doc)
+      head = doc.at "//nav[contains(@class,'heading-condensed')]"
+      ttls = head.xpath("h2 | h3 | h4").map &:text
+      ttls = ttls[0].split " - " if ttls.size == 1
+      case ttls.size
+      when 0, 1 then [nil, ttls.first, nil]
+      else RelatonBib::TypedTitleString.intro_or_part ttls
       end
+    end
-      # Return ISO script code.
-      # @param lang [String]
-      # @return [String]
-      def script(lang)
-        case lang
-        when "en", "fr" then "Latn"
-          # when "ru" then "Cyrl"
-        end
+    # Return ISO script code.
+    # @param lang [String]
+    # @return [String]
+    def script(lang)
+      case lang
+      when "en", "fr" then "Latn"
+        # when "ru" then "Cyrl"
       end
+    end
-      # rubocop:disable Metrics/MethodLength
-      # Fetch dates
-      # @param doc [Nokogiri::HTML::Document]
-      # @param ref [String]
-      # @return [Array<Hash>]
-      def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity
-        dates = []
-        %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
-        pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
-        if ref_date_str
-          ref_date = Date.strptime ref_date_str, "%Y"
-          if pub_date_str.empty?
+    # Fetch dates
+    # @param doc [Nokogiri::HTML::Document]
+    # @param ref [String]
+    # @return [Array<Hash>]
+    def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
+      dates = []
+      %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
+      pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
+      if ref_date_str
+        ref_date = Date.strptime ref_date_str, "%Y"
+        if pub_date_str.empty?
+          dates << { type: "published", on: ref_date_str }
+        else
+          pub_date = Date.strptime pub_date_str, "%Y"
+          if pub_date.year > ref_date.year
             dates << { type: "published", on: ref_date_str }
+            dates << { type: "updated", on: pub_date_str }
           else
-            pub_date = Date.strptime pub_date_str, "%Y"
-            if pub_date.year > ref_date.year
-              dates << { type: "published", on: ref_date_str }
-              dates << { type: "updated", on: pub_date_str }
-            else
-              dates << { type: "published", on: pub_date_str }
-            end
+            dates << { type: "published", on: pub_date_str }
           end
-        elsif !pub_date_str.empty?
-          dates << { type: "published", on: pub_date_str }
         end
-        dates
+      elsif !pub_date_str.empty?
+        dates << { type: "published", on: pub_date_str }
       end
+      dates
+    end
-      def fetch_contributors(ref)
-        ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
-          publisher = PUBLISHERS[abbrev]
-          next mem unless publisher
+    def fetch_contributors(ref)
+      ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
+        publisher = PUBLISHERS[abbrev]
+        next mem unless publisher
-          publisher[:abbreviation] = abbrev
-          mem << { entity: publisher, role: [type: "publisher"] }
-        end
-      end
-      # rubocop:enable Metrics/MethodLength
-      # Fetch ICS.
-      # @param doc [Nokogiri::HTML::Document]
-      # @return [Array<Hash>]
-      def fetch_ics(doc)
-        doc.xpath("//dl[dt/strong[.='ICS']]/dd/span/a").map do |i|
-          code = i.text.match(/[\d.]+/).to_s.split "."
-          { field: code[0], group: code[1], subgroup: code[2] }
-        end
+        publisher[:abbreviation] = abbrev
+        mem << { entity: publisher, role: [type: "publisher"] }
       end
+    end
-      # Fetch links.
-      # @param doc [Nokogiri::HTML::Document]
-      # @param url [String]
-      # @return [Array<Hash>]
-      def fetch_link(doc, url)
-        links = [{ type: "src", content: url }]
-        obp = doc.at_css("a#obp-preview")
-        links << { type: "obp", content: obp[:href] } if obp
-        rss = doc.at("//a[contains(@href, 'rss')]")
-        links << { type: "rss", content: DOMAIN + rss[:href] } if rss
-        pub = doc.at "//p[contains(., 'publicly available')]/a",
-                     "//p[contains(., 'can be downloaded from the')]/a"
-        links << { type: "pub", content: pub[:href] } if pub
-        links
+    # Fetch ICS.
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Array<Hash>]
+    def fetch_ics(doc)
+      doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
+        code = i.text.match(/[\d.]+/).to_s.split "."
+        { field: code[0], group: code[1], subgroup: code[2] }
       end
+    end
-      # Fetch copyright.
-      # @param doc [Nokogiri::HTML::Document]
-      # @return [Array<Hash>]
-      def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
-        ref = item_ref doc
-        owner_name = ref.match(/.*?(?=\s)/).to_s
-        from = ref.match(/(?<=:)\d{4}/).to_s
-        if from.empty?
-          date = doc.at(
-            "//span[@itemprop='releaseDate']",
-            "//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
-          )
-          from = date.text.match(/\d{4}/).to_s
-        end
-        [{ owner: [{ name: owner_name }], from: from }]
+    #
+    # Fetch links.
+    #
+    # @param doc [Nokogiri::HTML::Document] document to parse
+    # @param url [String] document url
+    #
+    # @return [Array<Hash>]
+    #
+    def fetch_link(doc, url)
+      links = [{ type: "src", content: url }]
+      obp = doc.at("//h4[contains(@class, 'h5')]/a")
+      links << { type: "obp", content: obp[:href] } if obp
+      rss = doc.at("//a[contains(@href, 'rss')]")
+      links << { type: "rss", content: DOMAIN + rss[:href] } if rss
+      pub = doc.at "//p[contains(., 'publicly available')]/a",
+                    "//p[contains(., 'can be downloaded from the')]/a"
+      links << { type: "pub", content: pub[:href] } if pub
+      links
+    end
+    # Fetch copyright.
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Array<Hash>]
+    def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
+      ref = item_ref doc
+      owner_name = ref.match(/.*?(?=\s)/).to_s
+      from = ref.match(/(?<=:)\d{4}/).to_s
+      if from.empty?
+        date = doc.at(
+          "//span[@itemprop='releaseDate']",
+          "//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
+        )
+        from = date.text.match(/\d{4}/).to_s
       end
+      [{ owner: [{ name: owner_name }], from: from }]
     end
   end
 end

data/lib/relaton_iso/version.rb CHANGED

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module RelatonIso
-  VERSION = "1.16.1"
+  VERSION = "1.16.3"
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: relaton-iso
 version: !ruby/object:Gem::Version
-  version: 1.16.1
+  version: 1.16.3
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-10-14 00:00:00.000000000 Z
+date: 2023-10-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: algolia