RubyGems - relaton-itu - Versions diffs - 1.0.0 → 1.0.1 - Mend

relaton-itu 1.0.0 → 1.0.1

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml +4 -4
data/lib/relaton_itu.rb +0 -5
data/lib/relaton_itu/hit.rb +2 -2
data/lib/relaton_itu/hit_collection.rb +38 -12
data/lib/relaton_itu/itu_bibliography.rb +22 -20
data/lib/relaton_itu/scrapper.rb +49 -77
data/lib/relaton_itu/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: cbdb4a303ff1173ad845aa15185093b9f93caaf1b3dfce4cde8dae9094bf8def
-  data.tar.gz: 8a096c17b4b6863596996556eb4319ee9451b6dadbf850c78578abb40424c380
+  metadata.gz: 169f68ca9de0a9e01f2130919807393d53507a2fd92f98a7043337ee9c037a18
+  data.tar.gz: a5fc6f91b1d6c6af3b25919c77f2a3f1cc57bbc527983207d5163088ba01ea6c
 SHA512:
-  metadata.gz: ec3aed1ca2c9ba554edd6990df967369104f23efa8286ca529f0d0baffefb753c50c3b54d9030d3b65f34efcbeb096bf4674cc435e8755b5584d38e6ab485d36
-  data.tar.gz: 18baf5f6f7b7b3473af763b286794d009350995b99a7cc7a76f343f360e5098a9ea795cda8eac36bf105328d02e88b730571e13bb793fc0ad60250979d67dbe3
+  metadata.gz: 3243b1b99b363cb8bb773320ad5c70f0e48de661b4db92c1e3a96991de163ff40dd6179f974ebd80507ad2b37aca4ad0e20ca549c880787752128517a4dd34a9
+  data.tar.gz: d40ffddd2ef5a4ec569f92ae6253148b85ae434b38fa31ec2eb27681bf3e948096137b0acf86535fade15e4ccc79e538668a6371dbb575416a8a842739cdcf97

data/lib/relaton_itu.rb CHANGED

@@ -2,11 +2,6 @@ require "relaton_itu/version"
 require "relaton_itu/itu_bibliography"
 require "digest/md5"
-# if defined? Relaton
-#   require_relative "relaton/processor"
-#   Relaton::Registry.instance.register(Relaton::RelatonItu::Processor)
-# end
 module RelatonItu
   class Error < StandardError; end

data/lib/relaton_itu/hit.rb CHANGED

@@ -4,9 +4,9 @@ module RelatonItu
   # Hit.
   class Hit < RelatonBib::Hit
     # Parse page.
-    # @return [Isobib::IsoBibliographicItem]
+    # @return [RelatonItu::ItuBibliographicItem]
     def fetch
-      @fetch ||= Scrapper.parse_page @hit
+      @fetch ||= Scrapper.parse_page hit, hit_collection.gi_imp
     end
   end
 end

data/lib/relaton_itu/hit_collection.rb CHANGED

@@ -7,16 +7,39 @@ require "net/http"
 module RelatonItu
   # Page of hit collection.
   class HitCollection < RelatonBib::HitCollection
-    DOMAIN = "https://www.itu.int".freeze
+    DOMAIN = "https://www.itu.int"
-    # @param ref_nbr [String]
+    # @return [TrueClass, FalseClass]
+    attr_reader :gi_imp
+    # @param ref [String]
     # @param year [String]
-    def initialize(ref_nbr, year = nil)
-      super
-      group = %r{(OB|Operational Bulletin) No} =~ text ? "Publications" : "Recommendations"
-      url = "#{DOMAIN}/net4/ITU-T/search/GlobalSearch/Search"
-      params = {
-        "Input" => ref_nbr,
+    def initialize(ref, year = nil)
+      text = ref.sub /(?<=\.)Imp\s?(?=\d)/, ""
+      super text, year
+      @gi_imp = /\.Imp\d/.match?(ref)
+      uri = URI "#{DOMAIN}/net4/ITU-T/search/GlobalSearch/Search"
+      data = { json: params.to_json }
+      resp = Net::HTTP.post(uri, data.to_json,
+                            "Content-Type" => "application/json")
+      @array = hits JSON.parse(resp.body)
+    end
+    private
+    # @return [String]
+    def group
+      @group ||= if %r{(OB|Operational Bulletin) No} =~ text then "Publications"
+                 else "Recommendations"
+                 end
+    end
+    # rubocop:disable Metrics/MethodLength
+    # @return [Hash]
+    def params
+      {
+        "Input" => text,
         "Start" => 0,
         "Rows" => 10,
         "SortBy" => "RELEVANCE",
@@ -61,10 +84,13 @@ module RelatonItu
         "IP" => "",
         "SearchType" => "All",
       }
-      data = { json: params.to_json }
-      resp  = Net::HTTP.post(URI(url), data.to_json, "Content-Type" => "application/json")
-      doc = JSON.parse resp.body
-      @array = doc["results"].map do |h|
+    end
+    # rubocop:enable Metrics/MethodLength
+    # @param data [Hash]
+    # @return [Array<RelatonItu::Hit>]
+    def hits(data)
+      data["results"].map do |h|
         code  = h["Media"]["Name"]
         title = h["Title"]
         url   = h["Redirection"]

data/lib/relaton_itu/itu_bibliography.rb CHANGED

@@ -19,9 +19,9 @@ module RelatonItu
       # @return [RelatonItu::HitCollection]
       def search(text, year = nil)
         HitCollection.new text, year
-      rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
-             Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
-             OpenSSL::SSL::SSLError
+      rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
+             EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
+             Net::ProtocolError, OpenSSL::SSL::SSLError
         raise RelatonBib::RequestError, "Could not access http://www.itu.int"
       end
@@ -66,17 +66,17 @@ module RelatonItu
         nil
       end
-      def fetch_pages(hits, threads)
-        workers = RelatonBib::WorkersPool.new threads
-        workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
-        hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
-        workers.end
-        workers.result.sort_by { |a| a[:i] }.map { |x| x[:hit] }
-      end
+      # def fetch_pages(hits, threads)
+      #   workers = RelatonBib::WorkersPool.new threads
+      #   workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
+      #   hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
+      #   workers.end
+      #   workers.result.sort_by { |a| a[:i] }.map { |x| x[:hit] }
+      # end
       def search_filter(code)
-        docidrx = %r{\w+.\d+|\w\sSuppl\.\s\d+} # %r{^ITU-T\s[^\s]+}
-        c = code.match(docidrx).to_s
+        docidrx = %r{\w+\.\d+|\w\sSuppl\.\s\d+} # %r{^ITU-T\s[^\s]+}
+        c = code.sub(/Imp\s?/, "").match(docidrx).to_s
         warn "[relaton-itu] (\"#{code}\") fetching..."
         result = search(code)
         result.select do |i|
@@ -93,16 +93,18 @@ module RelatonItu
       # If no match, returns any years which caused mismatch, for error reporting
       def isobib_results_filter(result, year)
         missed_years = []
-        result.each_slice(3) do |s| # ISO website only allows 3 connections
-          fetch_pages(s, 3).each do |r|
-            return { ret: r } if !year
+        # result.each_slice(3) do |s| # ISO website only allows 3 connections
+        #   fetch_pages(s, 3).each do |r|
+        result.each do |r|
+          return { ret: r.fetch } if !year
-            r.date.select { |d| d.type == "published" }.each do |d|
-              return { ret: r } if year.to_i == d.on.year
+          /\(\d{2}\/(?<pyear>\d{4})\)/ =~ r.hit[:code]
+          # r.date.select { |d| d.type == "published" }.each do |d|
+          return { ret: r.fetch } if year == pyear
-              missed_years << d.on.year
-            end
-          end
+          missed_years << pyear
+          # end
+          # end
         end
         { years: missed_years }
       end

data/lib/relaton_itu/scrapper.rb CHANGED

@@ -3,16 +3,9 @@
 require "nokogiri"
 require "net/http"
-# Capybara.request_driver :poltergeist do |app|
-#   Capybara::Poltergeist::Driver.new app, js_errors: false
-# end
-# Capybara.default_driver = :poltergeist
 module RelatonItu
   # Scrapper.
-  # rubocop:disable Metrics/ModuleLength
   module Scrapper
-    DOMAIN = "https://www.itu.int"
     ROMAN_MONTHS = %w[I II III IV V VI VII VIII IX X XI XII].freeze
     TYPES = {
@@ -31,24 +24,19 @@ module RelatonItu
     }.freeze
     class << self
-      # @param text [String]
-      # @return [Array<Hash>]
-      # def get(text)
-      #   iso_workers = WorkersPool.new 4
-      #   iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
-      #   algolia_workers = start_algolia_search(text, iso_workers)
-      #   iso_docs = iso_workers.result
-      #   algolia_workers.end
-      #   algolia_workers.result
-      #   iso_docs
-      # end
+      # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
       # Parse page.
-      # @param hit [Hash]
+      # @param hit_data [Hash]
       # @return [Hash]
-      # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
-      def parse_page(hit_data)
+      def parse_page(hit_data, imp = false)
         url, doc = get_page hit_data[:url]
+        if imp
+          a = doc.at "//span[contains(@id, 'tab_ig_uc_rec')]/a"
+          return unless a
+          url, doc = get_page URI.join(url, a[:href]).to_s
+        end
         # Fetch edition.
         edition = doc.at("//table/tr/td/span[contains(@id, 'Label8')]/b")&.text
@@ -73,7 +61,7 @@ module RelatonItu
           place: ["Geneva"],
         )
       end
-      # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
+      # rubocop:enable Metrics/AbcSize
       private
@@ -96,37 +84,23 @@ module RelatonItu
         }]
       end
-      # Get langs.
-      # @param doc [Nokogiri::HTML::Document]
-      # @return [Array<Hash>]
-      # def langs(doc)
-      #   lgs = [{ lang: 'en' }]
-      #   doc.css('ul#lang-switcher ul li a').each do |lang_link|
-      #     lang_path = lang_link.attr('href')
-      #     lang = lang_path.match(%r{^\/(fr)\/})
-      #     lgs << { lang: lang[1], path: lang_path } if lang
-      #   end
-      #   lgs
-      # end
-      # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
       # Get page.
       # @param path [String] page's path
-      # @return [Array<Nokogiri::HTML::Document, String>]
+      # @return [Array<String, Nokogiri::HTML::Document>]
       def get_page(url)
         uri = URI url
-        resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
+        resp = Net::HTTP.get_response(uri)
         until resp.code == "200"
           uri = URI resp["location"] if resp.code =~ /^30/
-          resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
+          resp = Net::HTTP.get_response(uri)
         end
         [uri.to_s, Nokogiri::HTML(resp.body)]
-      rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
-             Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
-             OpenSSL::SSL::SSLError
+      rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
+             EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
+             Net::ProtocolError, OpenSSL::SSL::SSLError
         raise RelatonBib::RequestError, "Could not access #{url}"
       end
-      # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
+      # rubocop:enable Metrics/MethodLength
       # Fetch docid.
       # @param doc [Nokogiri::HTML::Document]
@@ -135,9 +109,11 @@ module RelatonItu
         doc.xpath(
           "//span[@id='ctl00_content_main_uc_rec_main_info1_rpt_main_ctl00_lbl_rec']",
           "//td[.='Identical standard:']/following-sibling::td",
+          "//div/table[1]/tr[4]/td/strong",
         ).map do |code|
-          id = code.text.match(%r{^.*?(?= \()}).to_s.squeeze(" ")
+          id = code.text.match(%r{^.*?(?= \()|\w\.Imp\s?\d+}).to_s.squeeze(" ")
           type = id.match(%r{^\w+}).to_s
+          type = "ITU" if type == "G"
           RelatonBib::DocumentIdentifier.new(type: type, id: id)
         end
       end
@@ -146,10 +122,11 @@ module RelatonItu
       # @param doc [Nokogiri::HTML::Document]
       # @return [RelatonBib::DocumentStatus, NilClass]
       def fetch_status(doc)
-        s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]")
+        s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]",
+                   "//p[contains(.,'Status :')]")
         return unless s
-        status = s.text == "In force" ? "Published" : "Withdrawal"
+        status = s.text.include?("In force") ? "Published" : "Withdrawal"
         RelatonBib::DocumentStatus.new(stage: status)
       end
@@ -191,9 +168,7 @@ module RelatonItu
       # @return [Array<Hash>]
       def fetch_relations(doc)
         doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]').map do |r|
-          # r_type = r.at('./td/span[contains(@id, "Label4")]/nobr').text.downcase
           ref = r.at('./td/span[contains(@id, "title_e")]/nobr/a')
-          # url = DOMAIN + ref[:href].sub(/^\./, "/ITU-T/recommendations")
           fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en", script: "Latn")
           bibitem = RelatonIsoBib::IsoBibliographicItem.new(formattedref: fref)
           { type: "complements", bibitem: bibitem }
@@ -201,22 +176,14 @@ module RelatonItu
       end
       # rubocop:enable Metrics/MethodLength
-      # Fetch type.
-      # @param doc [Nokogiri::HTML::Document]
-      # @return [String]
-      # def fetch_type(_doc)
-      #   "recommendation"
-      # end
       # Fetch titles.
       # @param doc [Nokogiri::HTML::Document]
       # @return [Array<Hash>]
       def fetch_titles(doc)
-        # t = hit_data[:title].match(%r{(?<=\(\d{2}\/\d{4}\): ).*}).to_s
-        # t = hit_data[:title] if t.empty?
-        t = doc.at("//td[@class='title']")
+        t = doc.at("//td[@class='title']|//div/table[1]/tr[4]/td/strong")
         return [] unless t
-        titles = t.text.split " - "
+        titles = t.text.sub(/\w\.Imp\s?\d+\u00A0:\u00A0/, "").split " - "
         case titles.size
         when 0
           intro, main, part = nil, "", nil
@@ -247,10 +214,11 @@ module RelatonItu
       # @return [Array<Hash>]
       def fetch_dates(doc)
         dates = []
-        pdate = doc.at("//table/tr/td/span[contains(@id, 'Label5')]")
-        publish_date = pdate&.text || ob_date(doc)
-        if publish_date && !publish_date&.empty?
-          dates << { type: "published", on: publish_date }
+        date = doc.at("//table/tr/td/span[contains(@id, 'Label5')]",
+                      "//p[contains(.,'Approved in')]")
+        pdate = date&.text&.match(/\d{4}-\d{2}-\d{2}/).to_s || ob_date(doc)
+        if pdate && !pdate&.empty?
+          dates << { type: "published", on: pdate }
         end
         dates
       end
@@ -278,36 +246,41 @@ module RelatonItu
       # @param doc [Nokogiri::HTML::Document]
       # @return [Array<Hash>]
       def fetch_contributors(code)
+        return [] unless code
         abbrev = code.sub(/-\w\s.*/, "")
         case abbrev
         when "ITU"
           name = "International Telecommunication Union"
           url = "www.itu.int"
         end
-        [{ entity: { name: name, url: url, abbreviation: abbrev }, role: [type: "publisher"] }]
+        [{ entity: { name: name, url: url, abbreviation: abbrev },
+           role: [type: "publisher"] }]
       end
-      # Fetch ICS.
-      # @param doc [Nokogiri::HTML::Document]
-      # @return [Array<Hash>]
-      # def fetch_ics(doc)
-      #   doc.xpath('//th[contains(text(), "ICS")]/following-sibling::td/a').map do |i|
-      #     code = i.text.match(/[\d\.]+/).to_s.split '.'
-      #     { field: code[0], group: code[1], subgroup: code[2] }
-      #   end
-      # end
       # Fetch links.
       # @param doc [Nokogiri::HTML::Document]
       # @param url [String]
       # @return [Array<Hash>]
       def fetch_link(doc, url)
         links = [{ type: "src", content: url }]
-        obp_elms = doc.at('//a[@title="Persistent link to download the PDF file"]')
-        links << { type: "obp", content: DOMAIN + obp_elms[:href].strip } if obp_elms
+        obp_elm = doc.at(
+          '//a[@title="Persistent link to download the PDF file"]',
+          "//font[contains(.,'PDF')]/../..",
+        )
+        links << typed_link("obp", obp_elm) if obp_elm
+        wrd_elm = doc.at("//font[contains(.,'Word')]/../..")
+        links << typed_link("word", wrd_elm) if wrd_elm
         links
       end
+      def typed_link(type, elm)
+        {
+          type: type,
+          content: URI.join(HitCollection::DOMAIN + elm[:href].strip).to_s,
+        }
+      end
       # Fetch copyright.
       # @param code [String]
       # @param doc [Nokogiri::HTML::Document]
@@ -325,5 +298,4 @@ module RelatonItu
       end
     end
   end
-  # rubocop:enable Metrics/ModuleLength
 end

data/lib/relaton_itu/version.rb CHANGED

@@ -1,3 +1,3 @@
 module RelatonItu
-  VERSION = "1.0.0".freeze
+  VERSION = "1.0.1".freeze
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: relaton-itu
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.0.1
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-04-25 00:00:00.000000000 Z
+date: 2020-05-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: debase