RubyGems - relaton-iso - Versions diffs - 1.18.1 → 1.18.2 - Mend

relaton-iso 1.18.1 → 1.18.2

Files changed (15) hide show

checksums.yaml +4 -4
data/lib/relaton_iso/data_fetcher.rb +200 -0
data/lib/relaton_iso/document_identifier.rb +20 -1
data/lib/relaton_iso/hash_converter.rb +15 -0
data/lib/relaton_iso/hit.rb +29 -21
data/lib/relaton_iso/hit_collection.rb +74 -59
data/lib/relaton_iso/index.rb +132 -0
data/lib/relaton_iso/iso_bibliography.rb +172 -180
data/lib/relaton_iso/processor.rb +22 -2
data/lib/relaton_iso/queue.rb +61 -0
data/lib/relaton_iso/scrapper.rb +118 -70
data/lib/relaton_iso/version.rb +1 -1
data/lib/relaton_iso.rb +5 -0
data/relaton_iso.gemspec +1 -0
metadata +20 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: a483674664dc939f3ccf8b46d36e275db5426d9c164e0ec1a799ea321cb47694
-  data.tar.gz: 207870aaa9b1d89dc5dddd5922aac0b1c77c43b8bd0be292b2ffda7ccaeb712b
+  metadata.gz: 395d9fbaf99042f03785dd5b529d1a1821b8e5939a378503b6af4898901363d5
+  data.tar.gz: 4745ddb2a95d4b5dfc2290afdc2116c0ed9f113cda3c717909ca9987eec33486
 SHA512:
-  metadata.gz: 39b94b85564592a94934e449d92ebc7fe5b90d536cba83947d179791b57fda3618a47853e1776822498e2654fa910c2ddc71b126c539655794917d790bd8f56a
-  data.tar.gz: f34353074f0e0eb354b14b029726a964845fffbcf059bf6b9c5ef6e6a2cf0921dc84a25afc092c2b2da4088d3b833392c03dd20054b83195f40b16485a9ac04c
+  metadata.gz: fb0e270a99fc7a4a8cd07bf0f28d7c5d157976ffaf436523dd9ff871950726af552447f4e4eea1802eb0a59856702fd9ca95757b1b7fea3a89f69e5f83d3f876
+  data.tar.gz: 35d93ed7fdead4846059a22485ff56652b727640d754a2e4523350706966107418c3fb723eb278abaf17460f80612394dc61bf9720907294d821cacbc84e6b7f

data/lib/relaton_iso/data_fetcher.rb ADDED Viewed

@@ -0,0 +1,200 @@
+module RelatonIso
+  # Fetch all the documents from ISO website.
+  class DataFetcher
+    #
+    # Initialize data fetcher.
+    #
+    # @param [String] output output directory
+    # @param [String] format format of output files (yaml, bibxml, xml)
+    #
+    def initialize(output, format)
+      @output = output
+      @format = format
+      # File extension: a leading "bib" is dropped, so "bibxml" is saved as ".xml".
+      @ext = format.sub(/^bib/, "")
+      # Paths of the files written during this run (used to detect duplicates).
+      @files = []
+      # Work queue shared between the main thread and the worker threads.
+      @queue = ::Queue.new
+      # Serializes save_doc calls made from the worker threads.
+      @mutex = Mutex.new
+    end
+    #
+    # Memoized index obtained from Relaton::Index for the :iso type and the
+    # HitCollection::INDEXFILE file name.
+    #
+    def index
+      @index ||= Relaton::Index.find_or_create :iso, file: HitCollection::INDEXFILE
+    end
+    #
+    # Memoized queue of document page paths.
+    #
+    # @return [RelatonIso::Queue]
+    #
+    def iso_queue
+      @iso_queue ||= RelatonIso::Queue.new
+    end
+    #
+    # Initialize data fetcher and fetch data.
+    #
+    # @param [String] output output directory (default: "data")
+    # @param [String] format format of output files. Allowed: yaml (default), bibxml, xml
+    #
+    # @return [void]
+    #
+    def self.fetch(output: "data", format: "yaml")
+      t1 = Time.now
+      puts "Started at: #{t1}"
+      FileUtils.mkdir_p output
+      new(output, format).fetch
+      t2 = Time.now
+      puts "Stopped at: #{t2}"
+      puts "Done in: #{(t2 - t1).round} sec."
+    end
+    #
+    # Go through all ICS and fetch all documents.
+    #
+    # @return [void]
+    #
+    def fetch # rubocop:disable Metrics/AbcSize
+      puts "Scrapping ICS pages..."
+      fetch_ics
+      puts "[#{Time.now}] Scrapping documents..."
+      fetch_docs
+      iso_queue.save
+      # index.sort! { |a, b| compare_docids a, b }
+      index.save
+    end
+    #
+    # Fetch ICS pages recursively, starting from the catalogue root, and store
+    # all the links to documents in the iso_queue. The root page is fetched in
+    # the calling thread; the pages it links to are fetched by 3 worker threads.
+    #
+    # @return [void]
+    #
+    def fetch_ics
+      threads = Array.new(3) { thread { |path| fetch_ics_page(path) } }
+      fetch_ics_page "/standards-catalogue/browse-by-ics.html"
+      # An empty queue alone does not mean the crawl is over: a worker that is
+      # still parsing a page may enqueue more ICS links. Wait until the queue
+      # is drained AND every live worker is idle (blocked in @queue.pop).
+      sleep(1) until @queue.empty? && @queue.num_waiting >= threads.count(&:alive?)
+      threads.size.times { @queue << :END }
+      threads.each(&:join)
+    end
+    #
+    # Fetch one ICS page, add the document links found on it to the iso_queue
+    # and push the links to nested ICS pages to the work queue.
+    #
+    # @param [String] path path to ICS page
+    #
+    # @return [void]
+    #
+    def fetch_ics_page(path)
+      resp = get_redirection path
+      # get_redirection returns nil once its retries are exhausted (it has
+      # already warned), so skip the page instead of calling #body on nil.
+      return unless resp
+      page = Nokogiri::HTML(resp.body)
+      page.xpath("//td[@data-title='Standard and/or project']/div/div/a").each do |item|
+        iso_queue.add_first item[:href].split("?").first
+      end
+      page.xpath("//td[@data-title='ICS']/a").each do |item|
+        @queue << item[:href]
+      end
+    end
+    #
+    # Get the page from the given path. If the page is redirected, get the
+    # page from the new path. A request that times out is attempted up to
+    # 3 times in total.
+    #
+    # @param [String] path path to the page, appended to Scrapper::DOMAIN
+    #
+    # @return [Net::HTTPResponse, nil] HTTP response, or nil if every attempt timed out
+    #
+    def get_redirection(path) # rubocop:disable Metrics/MethodLength
+      try = 0
+      uri = URI(Scrapper::DOMAIN + path)
+      begin
+        get_response uri
+      rescue Net::OpenTimeout, Net::ReadTimeout => e
+        try += 1
+        retry if check_try try, uri
+        warn "Error fetching #{uri}"
+        warn e.message
+      end
+    end
+    #
+    # Request the URI and follow a 302 response via get_redirection.
+    #
+    # @param [URI] uri address to request
+    #
+    # @return [Net::HTTPResponse, nil] response, or whatever get_redirection returns for the new location
+    #
+    def get_response(uri)
+      resp = Net::HTTP.get_response(uri)
+      # NOTE(review): the Location value is appended to Scrapper::DOMAIN, so it
+      # is presumably a site-relative path - confirm against the ISO website.
+      resp.code == "302" ? get_redirection(resp["location"]) : resp
+    end
+    #
+    # Decide whether a timed-out request should be retried. When it should,
+    # warn and pause for a second first.
+    #
+    # @param [Integer] try number of failed attempts so far
+    # @param [URI] uri address being fetched (used in the warning)
+    #
+    # @return [Boolean, nil] true if try is below 3, otherwise nil
+    #
+    def check_try(try, uri)
+      if try < 3
+        warn "Timeout fetching #{uri}, retrying..."
+        sleep 1
+        true
+      end
+    end
+    #
+    # Fetch documents with 3 worker threads. Only the iso_queue entries in the
+    # range 0..10_000 are processed in one run.
+    #
+    # @return [void]
+    #
+    def fetch_docs
+      threads = Array.new(3) { thread { |path| fetch_doc(path) } }
+      iso_queue[0..10_000].each { |docpath| @queue << docpath }
+      threads.size.times { @queue << :END }
+      threads.each(&:join)
+    end
+    #
+    # Fetch document from ISO website. Any StandardError is reported with
+    # warn and swallowed, so one failing document does not stop the worker.
+    #
+    # @param [String] docpath document page path
+    #
+    # @return [void]
+    #
+    def fetch_doc(docpath)
+      # path = docpath.sub(/\.html$/, "")
+      # hit = Hit.new({ path: docpath }, nil)
+      doc = Scrapper.parse_page docpath
+      @mutex.synchronize { save_doc doc, docpath }
+    rescue StandardError => e
+      warn "Error fetching document: #{Scrapper::DOMAIN}#{docpath}"
+      warn e.message
+      warn e.backtrace
+    end
+    # def compare_docids(id1, id2)
+    #   Pubid::Iso::Identifier.create(**id1).to_s <=> Pubid::Iso::Identifier.create(**id2).to_s
+    # end
+    #
+    # Save document to file and register it in the index. The file name is
+    # the primary identifier, downcased, with runs of whitespace, "/" and ":"
+    # replaced by "-". A file name already written during this run is reported
+    # as a duplicate and not overwritten. In both cases the document path is
+    # moved to the end of the iso_queue.
+    #
+    # @param [RelatonIsoBib::IsoBibliographicItem] doc document
+    # @param [String] docpath document page path
+    #
+    # @return [void]
+    #
+    def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
+      docid = doc.docidentifier.detect(&:primary)
+      file_name = docid.id.gsub(/[\s\/:]+/, "-").downcase
+      file = File.join @output, "#{file_name}.#{@ext}"
+      if @files.include? file
+        warn "Duplicate file #{file} for #{docid.id} from #{Scrapper::DOMAIN}#{docpath}"
+      else
+        @files << file
+        index.add_or_update docid.to_h, file
+        File.write file, serialize(doc), encoding: "UTF-8"
+      end
+      iso_queue.move_last docpath
+    end
+    #
+    # Serialize document to string.
+    #
+    # @param [RelatonIsoBib::IsoBibliographicItem] doc document
+    #
+    # @return [String, nil] serialized document; nil when the format is not
+    #   one of "yaml", "bibxml" or "xml"
+    #
+    def serialize(doc)
+      case @format
+      when "yaml" then doc.to_hash.to_yaml
+      when "bibxml" then doc.to_bibxml
+      when "xml" then doc.to_xml bibdata: true
+      end
+    end
+    private
+    #
+    # Create thread worker. The worker pops items from the work queue and
+    # yields each one to the given block until it pops :END.
+    #
+    # @yieldparam [String] path item popped from the work queue
+    #
+    # @return [Thread] thread
+    #
+    def thread
+      Thread.new do
+        while (path = @queue.pop) != :END
+          yield path
+        end
+      end
+    end
+  end
+end

data/lib/relaton_iso/document_identifier.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module RelatonIso
   class DocumentIdentifier < RelatonBib::DocumentIdentifier
-    def id
+    def id # rubocop:disable Metrics/MethodLength
       id_str = @id.to_s.sub(/\sED\d+/, "").squeeze(" ").sub(/^ISO\/\s/, "ISO ") # workarounds for pubid gem bugs
       if @all_parts
         if type == "URN"
@@ -10,6 +10,12 @@ module RelatonIso
         end
       end
       type == "URN" ? @id.urn.to_s : id_str
+    rescue Pubid::Iso::Errors::NoEditionError => e
+      Util.warn "WARNING: #{type} identifier can't be generated for #{@id}: #{e.message}"
+    end
+    def to_h
+      stringify_values(@id.to_h) if @id.respond_to? :to_h
     end
     def remove_part
@@ -23,5 +29,18 @@ module RelatonIso
     def all_parts
       @all_parts = true
     end
+    def stringify_values(hash)
+      hash.transform_values { |v| stringify(v) }.reject { |_k, v| v.empty? }
+    end
+    def stringify(val)
+      case val
+      when Array then val.map { |i| i.is_a?(Hash) ? stringify_values(i) : i.to_s }
+      when Hash then stringify_values(val)
+      when Symbol then val
+      else val.to_s
+      end
+    end
   end
 end

data/lib/relaton_iso/hash_converter.rb ADDED Viewed

@@ -0,0 +1,15 @@
+module RelatonIso
+  module HashConverter
+    include RelatonIsoBib::HashConverter
+    extend self
+    def create_docid(**args)
+      begin
+        args[:id] = Pubid::Iso::Identifier.parse args[:id] if args[:id].is_a?(String) && args[:primary]
+      rescue StandardError
+        Util.warn "Unable to create a Pubid::Iso::Identifier from `#{args[:id]}`"
+      end
+      DocumentIdentifier.new(**args)
+    end
+  end
+end

data/lib/relaton_iso/hit.rb CHANGED Viewed

@@ -4,28 +4,29 @@ module RelatonIso
   # Hit.
   class Hit < RelatonBib::Hit
     # @return [RelatonIsoBib::IsoBibliographicItem]
-    attr_writer :fetch, :pubid
+    attr_writer :fetch
+    # @return [Pubid::Iso::Identifier] pubid
+    attr_writer :pubid
     # Update edition for pubid when provided in Bibliographic Item
-    def update_edition(bibliographic_item)
-      if bibliographic_item.edition
-        # add edition to base document if available
-        if pubid.base
-          pubid.base.edition = bibliographic_item.edition.content
-        else
-          pubid.edition = bibliographic_item.edition.content
-        end
-      end
-    end
+    # def update_edition(bibliographic_item)
+    #   if bibliographic_item.edition
+    #     pubid.root.edition = bibliographic_item.edition.content
+    #   end
+    # end
     # Parse page.
     # @param lang [String, nil]
     # @return [RelatonIso::IsoBibliographicItem]
-    def fetch(lang = nil)
-      @fetch ||= Scrapper.parse_page self, lang
-      # update edition for pubid using fetched data
-      update_edition(@fetch)
-      @fetch
+    def fetch(_lang = nil)
+      @fetch ||= begin
+        url = "#{HitCollection::ENDPOINT}#{hit[:file]}"
+        resp = Net::HTTP.get_response URI(url)
+        hash = YAML.safe_load resp.body
+        hash["fetched"] = Date.today.to_s
+        RelatonIsoBib::IsoBibliographicItem.from_hash hash
+      end
     end
     # @return [Integer]
@@ -41,11 +42,18 @@ module RelatonIso
     # @return [Pubid::Iso::Identifier]
     def pubid
-      @pubid ||= Pubid::Iso::Identifier.parse_from_title(hit[:title])
-    rescue Pubid::Iso::Errors::WrongTypeError,
-           Pubid::Iso::Errors::ParseError => e
-      Util.warn "Unable to find an identifier in: `#{hit[:title]}`."
-      Util.warn e.message
+      return @pubid if defined? @pubid
+      create_pubid hit[:id]
+    rescue StandardError
+      Util.warn "Unable to create an identifier from #{hit[:id]}"
+      @pubid = nil
+    end
+    private
+    def create_pubid(id)
+      @pubid = id.is_a?(Hash) ? Pubid::Iso::Identifier.create(**id) : id
     end
   end
 end

data/lib/relaton_iso/hit_collection.rb CHANGED Viewed

@@ -6,82 +6,97 @@ require "relaton_iso/hit"
 module RelatonIso
   # Page of hit collection.
   class HitCollection < RelatonBib::HitCollection
-    # @return [Boolean] whether the search was performed on GitHub
-    attr_reader :from_gh
+    INDEXFILE = "index-v1.yaml"
+    ENDPOINT = "https://raw.githubusercontent.com/relaton/relaton-data-iso/main/"
-    # @param text [String] reference to search
-    def initialize(text)
+    # @param pubid [Pubid::Iso::Identifier] reference to search
+    # @param opts [Hash] search options; this class reads `:all_parts` and `:lang`
+    def initialize(pubid, opts = {})
       super
-      @from_gh = text.match?(/^ISO[\s\/](?:TC\s184\/SC\s?4|IEC\sDIR\s(?:\d|IEC|JTC))/)
+      @opts = opts
     end
-    def fetch
-      @array = from_gh ? fetch_github : fetch_iso
+    # @return [Pubid::Iso::Identifier]
+    alias ref_pubid text
+    def ref_pubid_no_year
+      @ref_pubid_no_year ||= ref_pubid.dup.tap { |r| r.base = r.base.exclude(:year) if r.base }
+    end
+    def ref_pubid_excluded
+      @ref_pubid_excluded ||= ref_pubid_no_year.exclude(*excludings)
+    end
+    def fetch # rubocop:disable Metrics/AbcSize
+      @array = index.search do |row|
+        row[:id].is_a?(Hash) ? pubid_match?(row[:id]) : ref_pubid.to_s == row[:id]
+      end.map { |row| Hit.new row, self }
+        .sort_by! { |h| h.pubid.to_s }
+        .reverse!
       self
     end
-    # @param lang [String, NilClass]
+    def pubid_match?(id)
+      pubid = create_pubid(id)
+      return false unless pubid
+      pubid.base = pubid.base.exclude(:year, :edition) if pubid.base
+      dir_excludings = excludings.dup
+      dir_excludings << :edition unless pubid.typed_stage_abbrev == "DIR"
+      pubid.exclude(*dir_excludings) == ref_pubid_excluded
+    end
+    def create_pubid(id)
+      Pubid::Iso::Identifier.create(**id)
+    rescue StandardError => e
+      Util.warn "(#{ref_pubid}) WARNING: #{e.message}"
+      nil
+    end
+    def excludings
+      return @excludings if defined? @excludings
+      excl_parts = %i[year]
+      excl_parts << :part if ref_pubid.root.part.nil? || @opts[:all_parts]
+      if ref_pubid.stage.nil? || @opts[:all_parts]
+        excl_parts << :stage
+        excl_parts << :iteration
+      end
+      # excl_parts << :edition if ref_pubid.root.edition.nil? || all_parts
+      @escludings = excl_parts
+    end
+    def index
+      @index ||= Relaton::Index.find_or_create :iso, url: "#{ENDPOINT}index-v1.zip", file: INDEXFILE
+    end
+    def fetch_doc
+      if !@opts[:all_parts] || size == 1
+        any? && first.fetch(@opts[:lang])
+      else
+        to_all_parts(@opts[:lang])
+      end
+    end
+    # @param lang [String, nil]
     # @return [RelatonIsoBib::IsoBibliographicItem, nil]
-    def to_all_parts(lang = nil) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
-      # parts = @array.reject { |h| h.hit["docPart"]&.empty? }
+    def to_all_parts(lang = nil) # rubocop:disable Metrics/AbcSize
       hit = @array.min_by { |h| h.pubid.part.to_i }
       return @array.first&.fetch lang unless hit
       bibitem = hit.fetch(lang)
       all_parts_item = bibitem.to_all_parts
-      @array.reject { |h| h.hit[:uuid] == hit.hit[:uuid] }.each do |hi|
-        isobib = RelatonIsoBib::IsoBibliographicItem.new(
-          formattedref: RelatonBib::FormattedRef.new(content: hi.pubid.to_s),
-        )
-        all_parts_item.relation << RelatonBib::DocumentRelation.new(
-          type: "instanceOf", bibitem: isobib,
-        )
+      @array.reject { |h| h.pubid.part == hit.pubid.part }.each do |hi|
+        all_parts_item.relation << create_relation(hi)
       end
       all_parts_item
     end
-    private
-    #
-    # Fetch document from GitHub repository
-    #
-    # @return [Array<RelatonIso::Hit]
-    #
-    def fetch_github # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
-      ref = text.gsub(/[\s\/]/, "_").upcase
-      url = "https://raw.githubusercontent.com/relaton/relaton-data-iso/main/data/#{ref}.yaml"
-      resp = Net::HTTP.get_response URI(url)
-      return [] unless resp.code == "200"
-      hash = YAML.safe_load resp.body
-      bib_hash = RelatonIsoBib::HashConverter.hash_to_bib hash
-      bib_hash[:fetched] = Date.today.to_s
-      bib = RelatonIsoBib::IsoBibliographicItem.new(**bib_hash)
-      hit = Hit.new({ title: text }, self)
-      hit.fetch = bib
-      [hit]
-    end
-    #
-    # Fetch hits from iso.org
-    #
-    # @return [Array<RelatonIso::Hit>]
-    #
-    def fetch_iso # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
-      config = Algolia::Search::Config.new(application_id: "JCL49WV5AR", api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0")
-      client = Algolia::Search::Client.new config, logger: RelatonIso.configuration.logger
-      index = client.init_index "all_en"
-      resp = index.search text, hitsPerPage: 100, filters: "category:standard"
-      resp[:hits].map { |h| Hit.new h, self }.sort! do |a, b|
-        if a.sort_weight == b.sort_weight && b.hit[:year] = a.hit[:year]
-          a.hit[:title] <=> b.hit[:title]
-        elsif a.sort_weight == b.sort_weight
-          b.hit[:year] - a.hit[:year]
-        else
-          a.sort_weight - b.sort_weight
-        end
-      end
+    def create_relation(hit)
+      docid = DocumentIdentifier.new(id: hit.pubid, type: "ISO", primary: true)
+      isobib = RelatonIsoBib::IsoBibliographicItem.new(
+        formattedref: RelatonBib::FormattedRef.new(content: hit.pubid.to_s), docid: [docid],
+      )
+      RelatonBib::DocumentRelation.new(type: "instanceOf", bibitem: isobib)
     end
   end
 end

data/lib/relaton_iso/index.rb ADDED Viewed

@@ -0,0 +1,132 @@
+module RelatonIso
+  # Index.
+  class Index
+    #
+    # Initialise index. If file path is given, read index from file. If file is not
+    # given, look for it in a `/home/USER/.relaton/iso` directory. If file
+    # doesn't exist, or is outdated then fetch index from GitHub.
+    #
+    # @param [String, nil] file path to index file.
+    #
+    def initialize(file = nil)
+      @file = file
+    end
+    #
+    # Create index.
+    #
+    # @return [Array<Hash>] index
+    #
+    def index
+      @index ||= read_index || read_from_user_dir || fetch_index
+    end
+    #
+    # Add or update index entry.
+    #
+    # @param [RelatonIsoBib::IsoBibliographicItem] item document
+    #
+    # @return [void]
+    #
+    def <<(item)
+      id = item.docidentifier.detect(&:primary).id
+      row = self[id] || begin
+        r = { id: id }
+        index << r
+        r
+      end
+      row[:title] = item.title.first.title.content
+    end
+    #
+    # Fetch document from index by ID.
+    #
+    # @param [String] id document ID
+    #
+    # @return [Hash] index entry
+    #
+    def [](id)
+      index.detect { |i| i[:id] == id }
+    end
+    #
+    # Save index to file.
+    #
+    # @return [void]
+    #
+    def save
+      serialize_and_save index
+    end
+    private
+    #
+    # Serialize index and save to file.
+    #
+    # @param [Array<Hash>] idx index
+    #
+    # @return [void]
+    #
+    def serialize_and_save(idx)
+      File.open(@file, "w:UTF-8") do |f|
+        f.puts "---"
+        idx.each do |i|
+          f.puts i.transform_keys(&:to_s).to_yaml.sub("---\n", "")
+        end
+      end
+    end
+    #
+    # Read index from file. If file doesn't exist, create empty index.
+    #
+    # @return [Array<Hash>, nil] index
+    #
+    def read_index
+      if @file && File.exist?(@file) then read_file
+      elsif @file then []
+      end
+    end
+    #
+    # Read index from `/home/USER/.relaton/iso` or fetch it from GitHub,
+    # if file doesn't exist, or is outdated.
+    #
+    # @return [Array<Hash>] index
+    #
+    def read_from_user_dir
+      @file = File.join(Dir.home, "index.yml")
+      read_file if File.exist?(@file) && !outdated?
+    end
+    def read_file
+      yaml = File.read @file, encoding: "UTF-8"
+      RelatonBib.parse_yaml yaml, [], symbolize_names: true
+    end
+    #
+    # Check if index file is outdated.
+    #
+    # @return [Boolean] true if older than 24 hours
+    #
+    def outdated?
+      (Time.now - File.mtime(@file)) / 3600 > 24
+    end
+    #
+    # Fetch index from GitHub.
+    #
+    # @return [Array<Hash>] index
+    #
+    def fetch_index
+      url = "https://raw.githubusercontent.com/relaton/relaton-data-iso/master/iso/index.zip"
+      zip = Zip::InputStream.new URI(url).open
+      yaml = zip.get_next_entry.get_input_stream.read
+      idx = RelatonBib.parse_yaml yaml, [], symbolize_names: true
+      serialize_and_save idx
+      idx
+    rescue OpenURI::HTTPError => e
+      warn "[relaton-iso] WARNING: failed to fetch index: #{e.message}"
+      []
+    end
+  end
+end