RubyGems - relaton-iso - Versions diffs - 1.18.1 → 1.18.2 - Mend

relaton-iso 1.18.1 → 1.18.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

checksums.yaml +4 -4
data/lib/relaton_iso/data_fetcher.rb +200 -0
data/lib/relaton_iso/document_identifier.rb +20 -1
data/lib/relaton_iso/hash_converter.rb +15 -0
data/lib/relaton_iso/hit.rb +29 -21
data/lib/relaton_iso/hit_collection.rb +74 -59
data/lib/relaton_iso/index.rb +132 -0
data/lib/relaton_iso/iso_bibliography.rb +172 -180
data/lib/relaton_iso/processor.rb +22 -2
data/lib/relaton_iso/queue.rb +61 -0
data/lib/relaton_iso/scrapper.rb +118 -70
data/lib/relaton_iso/version.rb +1 -1
data/lib/relaton_iso.rb +5 -0
data/relaton_iso.gemspec +1 -0
metadata +20 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: a483674664dc939f3ccf8b46d36e275db5426d9c164e0ec1a799ea321cb47694
-  data.tar.gz: 207870aaa9b1d89dc5dddd5922aac0b1c77c43b8bd0be292b2ffda7ccaeb712b
+  metadata.gz: 395d9fbaf99042f03785dd5b529d1a1821b8e5939a378503b6af4898901363d5
+  data.tar.gz: 4745ddb2a95d4b5dfc2290afdc2116c0ed9f113cda3c717909ca9987eec33486
 SHA512:
-  metadata.gz: 39b94b85564592a94934e449d92ebc7fe5b90d536cba83947d179791b57fda3618a47853e1776822498e2654fa910c2ddc71b126c539655794917d790bd8f56a
-  data.tar.gz: f34353074f0e0eb354b14b029726a964845fffbcf059bf6b9c5ef6e6a2cf0921dc84a25afc092c2b2da4088d3b833392c03dd20054b83195f40b16485a9ac04c
+  metadata.gz: fb0e270a99fc7a4a8cd07bf0f28d7c5d157976ffaf436523dd9ff871950726af552447f4e4eea1802eb0a59856702fd9ca95757b1b7fea3a89f69e5f83d3f876
+  data.tar.gz: 35d93ed7fdead4846059a22485ff56652b727640d754a2e4523350706966107418c3fb723eb278abaf17460f80612394dc61bf9720907294d821cacbc84e6b7f

data/lib/relaton_iso/data_fetcher.rb ADDED Viewed

@@ -0,0 +1,200 @@
+module RelatonIso
+  # Fetch all the documents from ISO website.
+  class DataFetcher
+    #
+    # Initialize data fetcher.
+    #
+    # @param [String] output output directory
+    # @param [String] format format of output files (yaml, bibxml, xml)
+    #
+    def initialize(output, format)
+      @output = output
+      @format = format
+      @ext = format.sub(/^bib/, "")
+      @files = []
+      @queue = ::Queue.new
+      @mutex = Mutex.new
+    end
+    def index
+      @index ||= Relaton::Index.find_or_create :iso, file: HitCollection::INDEXFILE
+    end
+    def iso_queue
+      @iso_queue ||= RelatonIso::Queue.new
+    end
+    #
+    # Initialize data fetcher and fetch data.
+    #
+    # @param [String] output output directory (default: "data")
+    # @param [String] format format of output files. Allowed: yaml (default), bibxml, xml
+    #
+    # @return [void]
+    #
+    def self.fetch(output: "data", format: "yaml")
+      t1 = Time.now
+      puts "Started at: #{t1}"
+      FileUtils.mkdir_p output
+      new(output, format).fetch
+      t2 = Time.now
+      puts "Stopped at: #{t2}"
+      puts "Done in: #{(t2 - t1).round} sec."
+    end
+    #
+    # Go through all ICS and fetch all documents.
+    #
+    # @return [void]
+    #
+    def fetch # rubocop:disable Metrics/AbcSize
+      puts "Scrapping ICS pages..."
+      fetch_ics
+      puts "[#{Time.now}] Scrapping documents..."
+      fetch_docs
+      iso_queue.save
+      # index.sort! { |a, b| compare_docids a, b }
+      index.save
+    end
+    #
+    # Fetch ICS page recursively and store all the links to documents in the iso_queue.
+    #
+    # @param [String] path path to ICS page
+    #
+    def fetch_ics
+      threads = Array.new(3) { thread { |path| fetch_ics_page(path) } }
+      fetch_ics_page "/standards-catalogue/browse-by-ics.html"
+      sleep(1) until @queue.empty?
+      threads.size.times { @queue << :END }
+      threads.each(&:join)
+    end
+    def fetch_ics_page(path)
+      resp = get_redirection path
+      page = Nokogiri::HTML(resp.body)
+      page.xpath("//td[@data-title='Standard and/or project']/div/div/a").each do |item|
+        iso_queue.add_first item[:href].split("?").first
+      end
+      page.xpath("//td[@data-title='ICS']/a").each do |item|
+        @queue << item[:href]
+      end
+    end
+    #
+    # Get the page from the given path. If the page is redirected, get the
+    # page from the new path.
+    #
+    # @param [String] path path to the page
+    #
+    # @return [Net::HTTPOK] HTTP response
+    #
+    def get_redirection(path) # rubocop:disable Metrics/MethodLength
+      try = 0
+      uri = URI(Scrapper::DOMAIN + path)
+      begin
+        get_response uri
+      rescue Net::OpenTimeout, Net::ReadTimeout => e
+        try += 1
+        retry if check_try try, uri
+        warn "Error fetching #{uri}"
+        warn e.message
+      end
+    end
+    def get_response(uri)
+      resp = Net::HTTP.get_response(uri)
+      resp.code == "302" ? get_redirection(resp["location"]) : resp
+    end
+    def check_try(try, uri)
+      if try < 3
+        warn "Timeout fetching #{uri}, retrying..."
+        sleep 1
+        true
+      end
+    end
+    def fetch_docs
+      threads = Array.new(3) { thread { |path| fetch_doc(path) } }
+      iso_queue[0..10_000].each { |docpath| @queue << docpath }
+      threads.size.times { @queue << :END }
+      threads.each(&:join)
+    end
+    #
+    # Fetch document from ISO website.
+    #
+    # @param [String] docpath document page path
+    #
+    # @return [void]
+    #
+    def fetch_doc(docpath)
+      # path = docpath.sub(/\.html$/, "")
+      # hit = Hit.new({ path: docpath }, nil)
+      doc = Scrapper.parse_page docpath
+      @mutex.synchronize { save_doc doc, docpath }
+    rescue StandardError => e
+      warn "Error fetching document: #{Scrapper::DOMAIN}#{docpath}"
+      warn e.message
+      warn e.backtrace
+    end
+    # def compare_docids(id1, id2)
+    #   Pubid::Iso::Identifier.create(**id1).to_s <=> Pubid::Iso::Identifier.create(**id2).to_s
+    # end
+    #
+    # save document to file.
+    #
+    # @param [RelatonIsoBib::IsoBibliographicItem] doc document
+    #
+    # @return [void]
+    #
+    def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
+      docid = doc.docidentifier.detect(&:primary)
+      file_name = docid.id.gsub(/[\s\/:]+/, "-").downcase
+      file = File.join @output, "#{file_name}.#{@ext}"
+      if @files.include? file
+        warn "Duplicate file #{file} for #{docid.id} from #{Scrapper::DOMAIN}#{docpath}"
+      else
+        @files << file
+        index.add_or_update docid.to_h, file
+        File.write file, serialize(doc), encoding: "UTF-8"
+      end
+      iso_queue.move_last docpath
+    end
+    #
+    # Serialize document to string.
+    #
+    # @param [RelatonIsoBib::IsoBibliographicItem] doc document
+    #
+    # @return [String] serialized document
+    #
+    def serialize(doc)
+      case @format
+      when "yaml" then doc.to_hash.to_yaml
+      when "bibxml" then doc.to_bibxml
+      when "xml" then doc.to_xml bibdata: true
+      end
+    end
+    private
+    #
+    # Create thread worker
+    #
+    # @return [Thread] thread
+    #
+    def thread
+      Thread.new do
+        while (path = @queue.pop) != :END
+          yield path
+        end
+      end
+    end
+  end
+end

data/lib/relaton_iso/document_identifier.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module RelatonIso
   class DocumentIdentifier < RelatonBib::DocumentIdentifier
-    def id
+    def id # rubocop:disable Metrics/MethodLength
       id_str = @id.to_s.sub(/\sED\d+/, "").squeeze(" ").sub(/^ISO\/\s/, "ISO ") # workarounds for pubid gem bugs
       if @all_parts
         if type == "URN"
@@ -10,6 +10,12 @@ module RelatonIso
         end
       end
       type == "URN" ? @id.urn.to_s : id_str
+    rescue Pubid::Iso::Errors::NoEditionError => e
+      Util.warn "WARNING: #{type} identifier can't be generated for #{@id}: #{e.message}"
+    end
+    def to_h
+      stringify_values(@id.to_h) if @id.respond_to? :to_h
     end
     def remove_part
@@ -23,5 +29,18 @@ module RelatonIso
     def all_parts
       @all_parts = true
     end
+    def stringify_values(hash)
+      hash.transform_values { |v| stringify(v) }.reject { |_k, v| v.empty? }
+    end
+    def stringify(val)
+      case val
+      when Array then val.map { |i| i.is_a?(Hash) ? stringify_values(i) : i.to_s }
+      when Hash then stringify_values(val)
+      when Symbol then val
+      else val.to_s
+      end
+    end
   end
 end

data/lib/relaton_iso/hash_converter.rb ADDED Viewed

@@ -0,0 +1,15 @@
+module RelatonIso
+  module HashConverter
+    include RelatonIsoBib::HashConverter
+    extend self
+    def create_docid(**args)
+      begin
+        args[:id] = Pubid::Iso::Identifier.parse args[:id] if args[:id].is_a?(String) && args[:primary]
+      rescue StandardError
+        Util.warn "Unable to create a Pubid::Iso::Identifier from `#{args[:id]}`"
+      end
+      DocumentIdentifier.new(**args)
+    end
+  end
+end

data/lib/relaton_iso/hit.rb CHANGED Viewed

@@ -4,28 +4,29 @@ module RelatonIso
   # Hit.
   class Hit < RelatonBib::Hit
     # @return [RelatonIsoBib::IsoBibliographicItem]
-    attr_writer :fetch, :pubid
+    attr_writer :fetch
+    # @return [Pubid::Iso::Identifier] pubid
+    attr_writer :pubid
     # Update edition for pubid when provided in Bibliographic Item
-    def update_edition(bibliographic_item)
-      if bibliographic_item.edition
-        # add edition to base document if available
-        if pubid.base
-          pubid.base.edition = bibliographic_item.edition.content
-        else
-          pubid.edition = bibliographic_item.edition.content
-        end
-      end
-    end
+    # def update_edition(bibliographic_item)
+    #   if bibliographic_item.edition
+    #     pubid.root.edition = bibliographic_item.edition.content
+    #   end
+    # end
     # Parse page.
     # @param lang [String, nil]
     # @return [RelatonIso::IsoBibliographicItem]
-    def fetch(lang = nil)
-      @fetch ||= Scrapper.parse_page self, lang
-      # update edition for pubid using fetched data
-      update_edition(@fetch)
-      @fetch
+    def fetch(_lang = nil)
+      @fetch ||= begin
+        url = "#{HitCollection::ENDPOINT}#{hit[:file]}"
+        resp = Net::HTTP.get_response URI(url)
+        hash = YAML.safe_load resp.body
+        hash["fetched"] = Date.today.to_s
+        RelatonIsoBib::IsoBibliographicItem.from_hash hash
+      end
     end
     # @return [Integer]
@@ -41,11 +42,18 @@ module RelatonIso
     # @return [Pubid::Iso::Identifier]
     def pubid
-      @pubid ||= Pubid::Iso::Identifier.parse_from_title(hit[:title])
-    rescue Pubid::Iso::Errors::WrongTypeError,
-           Pubid::Iso::Errors::ParseError => e
-      Util.warn "Unable to find an identifier in: `#{hit[:title]}`."
-      Util.warn e.message
+      return @pubid if defined? @pubid
+      create_pubid hit[:id]
+    rescue StandardError
+      Util.warn "Unable to create an identifier from #{hit[:id]}"
+      @pubid = nil
+    end
+    private
+    def create_pubid(id)
+      @pubid = id.is_a?(Hash) ? Pubid::Iso::Identifier.create(**id) : id
     end
   end
 end

data/lib/relaton_iso/hit_collection.rb CHANGED Viewed

@@ -6,82 +6,97 @@ require "relaton_iso/hit"
 module RelatonIso
   # Page of hit collection.
   class HitCollection < RelatonBib::HitCollection
-    # @return [Boolean] whether the search was performed on GitHub
-    attr_reader :from_gh
+    INDEXFILE = "index-v1.yaml"
+    ENDPOINT = "https://raw.githubusercontent.com/relaton/relaton-data-iso/main/"
-    # @param text [String] reference to search
-    def initialize(text)
+    # @param text [Pubid::Iso::Identifier] reference to search
+    def initialize(pubid, opts = {})
       super
-      @from_gh = text.match?(/^ISO[\s\/](?:TC\s184\/SC\s?4|IEC\sDIR\s(?:\d|IEC|JTC))/)
+      @opts = opts
     end
-    def fetch
-      @array = from_gh ? fetch_github : fetch_iso
+    # @return [Pubid::Iso::Identifier]
+    alias ref_pubid text
+    def ref_pubid_no_year
+      @ref_pubid_no_year ||= ref_pubid.dup.tap { |r| r.base = r.base.exclude(:year) if r.base }
+    end
+    def ref_pubid_excluded
+      @ref_pubid_excluded ||= ref_pubid_no_year.exclude(*excludings)
+    end
+    def fetch # rubocop:disable Metrics/AbcSize
+      @array = index.search do |row|
+        row[:id].is_a?(Hash) ? pubid_match?(row[:id]) : ref_pubid.to_s == row[:id]
+      end.map { |row| Hit.new row, self }
+        .sort_by! { |h| h.pubid.to_s }
+        .reverse!
       self
     end
-    # @param lang [String, NilClass]
+    def pubid_match?(id)
+      pubid = create_pubid(id)
+      return false unless pubid
+      pubid.base = pubid.base.exclude(:year, :edition) if pubid.base
+      dir_excludings = excludings.dup
+      dir_excludings << :edition unless pubid.typed_stage_abbrev == "DIR"
+      pubid.exclude(*dir_excludings) == ref_pubid_excluded
+    end
+    def create_pubid(id)
+      Pubid::Iso::Identifier.create(**id)
+    rescue StandardError => e
+      Util.warn "(#{ref_pubid}) WARNING: #{e.message}"
+      nil
+    end
+    def excludings
+      return @excludings if defined? @excludings
+      excl_parts = %i[year]
+      excl_parts << :part if ref_pubid.root.part.nil? || @opts[:all_parts]
+      if ref_pubid.stage.nil? || @opts[:all_parts]
+        excl_parts << :stage
+        excl_parts << :iteration
+      end
+      # excl_parts << :edition if ref_pubid.root.edition.nil? || all_parts
+      @escludings = excl_parts
+    end
+    def index
+      @index ||= Relaton::Index.find_or_create :iso, url: "#{ENDPOINT}index-v1.zip", file: INDEXFILE
+    end
+    def fetch_doc
+      if !@opts[:all_parts] || size == 1
+        any? && first.fetch(@opts[:lang])
+      else
+        to_all_parts(@opts[:lang])
+      end
+    end
+    # @param lang [String, nil]
     # @return [RelatonIsoBib::IsoBibliographicItem, nil]
-    def to_all_parts(lang = nil) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
-      # parts = @array.reject { |h| h.hit["docPart"]&.empty? }
+    def to_all_parts(lang = nil) # rubocop:disable Metrics/AbcSize
       hit = @array.min_by { |h| h.pubid.part.to_i }
       return @array.first&.fetch lang unless hit
       bibitem = hit.fetch(lang)
       all_parts_item = bibitem.to_all_parts
-      @array.reject { |h| h.hit[:uuid] == hit.hit[:uuid] }.each do |hi|
-        isobib = RelatonIsoBib::IsoBibliographicItem.new(
-          formattedref: RelatonBib::FormattedRef.new(content: hi.pubid.to_s),
-        )
-        all_parts_item.relation << RelatonBib::DocumentRelation.new(
-          type: "instanceOf", bibitem: isobib,
-        )
+      @array.reject { |h| h.pubid.part == hit.pubid.part }.each do |hi|
+        all_parts_item.relation << create_relation(hi)
       end
       all_parts_item
     end
-    private
-    #
-    # Fetch document from GitHub repository
-    #
-    # @return [Array<RelatonIso::Hit]
-    #
-    def fetch_github # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
-      ref = text.gsub(/[\s\/]/, "_").upcase
-      url = "https://raw.githubusercontent.com/relaton/relaton-data-iso/main/data/#{ref}.yaml"
-      resp = Net::HTTP.get_response URI(url)
-      return [] unless resp.code == "200"
-      hash = YAML.safe_load resp.body
-      bib_hash = RelatonIsoBib::HashConverter.hash_to_bib hash
-      bib_hash[:fetched] = Date.today.to_s
-      bib = RelatonIsoBib::IsoBibliographicItem.new(**bib_hash)
-      hit = Hit.new({ title: text }, self)
-      hit.fetch = bib
-      [hit]
-    end
-    #
-    # Fetch hits from iso.org
-    #
-    # @return [Array<RelatonIso::Hit>]
-    #
-    def fetch_iso # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
-      config = Algolia::Search::Config.new(application_id: "JCL49WV5AR", api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0")
-      client = Algolia::Search::Client.new config, logger: RelatonIso.configuration.logger
-      index = client.init_index "all_en"
-      resp = index.search text, hitsPerPage: 100, filters: "category:standard"
-      resp[:hits].map { |h| Hit.new h, self }.sort! do |a, b|
-        if a.sort_weight == b.sort_weight && b.hit[:year] = a.hit[:year]
-          a.hit[:title] <=> b.hit[:title]
-        elsif a.sort_weight == b.sort_weight
-          b.hit[:year] - a.hit[:year]
-        else
-          a.sort_weight - b.sort_weight
-        end
-      end
+    def create_relation(hit)
+      docid = DocumentIdentifier.new(id: hit.pubid, type: "ISO", primary: true)
+      isobib = RelatonIsoBib::IsoBibliographicItem.new(
+        formattedref: RelatonBib::FormattedRef.new(content: hit.pubid.to_s), docid: [docid],
+      )
+      RelatonBib::DocumentRelation.new(type: "instanceOf", bibitem: isobib)
     end
   end
 end

data/lib/relaton_iso/index.rb ADDED Viewed

@@ -0,0 +1,132 @@
+module RelatonIso
+  # Index.
+  class Index
+    #
+    # Initialise index. If file path is given, read index from file. If file is not
+    # given, look for it in a `/home/USER/.relaton/iso` directory. If file
+    # doesn't exist, or is outdated then fetch index from GitHub.
+    #
+    # @param [String, nil] file path to index file.
+    #
+    def initialize(file = nil)
+      @file = file
+    end
+    #
+    # Create index.
+    #
+    # @return [Array<Hash>] index
+    #
+    def index
+      @index ||= read_index || read_from_user_dir || fetch_index
+    end
+    #
+    # Add or update index entry.
+    #
+    # @param [RelatonIsoBib::IsoBibliographicItem] item document
+    #
+    # @return [void]
+    #
+    def <<(item)
+      id = item.docidentifier.detect(&:primary).id
+      row = self[id] || begin
+        r = { id: id }
+        index << r
+        r
+      end
+      row[:title] = item.title.first.title.content
+    end
+    #
+    # Fetch document from index by ID.
+    #
+    # @param [String] id document ID
+    #
+    # @return [Hash] index entry
+    #
+    def [](id)
+      index.detect { |i| i[:id] == id }
+    end
+    #
+    # Save index to file.
+    #
+    # @return [void]
+    #
+    def save
+      serialize_and_save index
+    end
+    private
+    #
+    # Serialize index and save to file.
+    #
+    # @param [Array<Hash>] idx index
+    #
+    # @return [void]
+    #
+    def serialize_and_save(idx)
+      File.open(@file, "w:UTF-8") do |f|
+        f.puts "---"
+        idx.each do |i|
+          f.puts i.transform_keys(&:to_s).to_yaml.sub("---\n", "")
+        end
+      end
+    end
+    #
+    # Read index from file. If file doesn't exist, create empty index.
+    #
+    # @return [Array<Hash>, nil] index
+    #
+    def read_index
+      if @file && File.exist?(@file) then read_file
+      elsif @file then []
+      end
+    end
+    #
+    # Read index from `/home/USER/.relaton/iso` or fetch it from GitHub,
+    # if file doesn't exist, or is outdated.
+    #
+    # @return [Array<Hash>] index
+    #
+    def read_from_user_dir
+      @file = File.join(Dir.home, "index.yml")
+      read_file if File.exist?(@file) && !outdated?
+    end
+    def read_file
+      yaml = File.read @file, encoding: "UTF-8"
+      RelatonBib.parse_yaml yaml, [], symbolize_names: true
+    end
+    #
+    # Check if index file is outdated.
+    #
+    # @return [Boolean] true if older than 24 hours
+    #
+    def outdated?
+      (Time.now - File.mtime(@file)) / 3600 > 24
+    end
+    #
+    # Fetch index from GitHub.
+    #
+    # @return [Array<Hash>] index
+    #
+    def fetch_index
+      url = "https://raw.githubusercontent.com/relaton/relaton-data-iso/master/iso/index.zip"
+      zip = Zip::InputStream.new URI(url).open
+      yaml = zip.get_next_entry.get_input_stream.read
+      idx = RelatonBib.parse_yaml yaml, [], symbolize_names: true
+      serialize_and_save idx
+      idx
+    rescue OpenURI::HTTPError => e
+      warn "[relaton-iso] WARNING: failed to fetch index: #{e.message}"
+      []
+    end
+  end
+end