RubyGems - relaton-nist - Versions diffs - 0.1.0 - Mend

relaton-nist 0.1.0

Files changed (27) hide show

checksums.yaml +7 -0
data/.gitignore +13 -0
data/.rspec +3 -0
data/.rubocop.yml +10 -0
data/.travis.yml +17 -0
data/Gemfile +7 -0
data/Gemfile.lock +85 -0
data/LICENSE.txt +21 -0
data/README.adoc +128 -0
data/Rakefile +6 -0
data/appveyor.yml +30 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/lib/relaton/processor.rb +23 -0
data/lib/relaton_nist/comment_period.rb +30 -0
data/lib/relaton_nist/document_status.rb +24 -0
data/lib/relaton_nist/hit.rb +54 -0
data/lib/relaton_nist/hit_collection.rb +90 -0
data/lib/relaton_nist/keyword.rb +16 -0
data/lib/relaton_nist/nist_bibliographic_item.rb +78 -0
data/lib/relaton_nist/nist_bibliography.rb +150 -0
data/lib/relaton_nist/scrapper.rb +329 -0
data/lib/relaton_nist/version.rb +3 -0
data/lib/relaton_nist/xml_parser.rb +47 -0
data/lib/relaton_nist.rb +12 -0
data/relaton_nist.gemspec +39 -0
metadata +237 -0

data/lib/relaton_nist/nist_bibliography.rb ADDED Viewed

@@ -0,0 +1,150 @@
+require "relaton_bib"
+require "relaton_nist/nist_bibliographic_item"
+require "relaton_nist/scrapper"
+require "relaton_nist/hit_collection"
+require "relaton_nist/xml_parser"
+require "relaton_nist/keyword"
+require "relaton_nist/comment_period"
+require "relaton_nist/document_status"
+module RelatonNist
+  class NistBibliography
+    class << self
+      # Query NIST for publications matching a reference.
+      #
+      # @param text [String] reference to search for
+      # @param year [String, nil] publication year, handed to HitCollection
+      # @param opts [Hash] options, handed to HitCollection
+      # @return [RelatonNist::HitCollection, Array] the hits; an empty Array
+      #   when OpenURI::HTTPError or SocketError is raised
+      def search(text, year = nil, opts = {})
+        begin
+          HitCollection.new(text, year, opts)
+        rescue OpenURI::HTTPError, SocketError
+          warn "Could not access https://www.nist.gov"
+          []
+        end
+      end
+      # Look up a single NIST publication.
+      #
+      # Besides the bare code, `code` may carry a parenthesised date and/or
+      # stage, e.g. "CODE (October 2018)", "CODE (October 05, 2018)" or
+      # "CODE (IPD)". A "Month YYYY" date becomes the :issued_date filter, a
+      # "Month DD, YYYY" date the :updated_date filter, and the trailing
+      # parenthesised text the :stage filter. When `year` is nil, a
+      # "CODE:YEAR" reference is split into code and year.
+      #
+      # @param code [String] the NIST standard Code to look up (e.g. "8200")
+      # @param year [String] the year the standard was published (optional)
+      #
+      # @param opts [Hash] options; the Hash itself is never modified
+      # @option opts [TrueClass, FalseClass] :all_parts appends "-1" to the
+      #   code before searching
+      # @option opts [TrueClass, FalseClass] :bibdata
+      #
+      # @return [Object, nil] whatever the matching hit's `fetch` produced, or
+      #   nil when nothing matched (not an XML String)
+      def get(code, year = nil, opts = {})
+        # Work on a copy: the date/stage filters parsed out of `code` must not
+        # leak into the caller's Hash and from there into later lookups.
+        opts = opts.dup
+        /^(?<code2>[^\(]+)(\((?<date2>\w+\s(\d{2},\s)?\d{4})\))?\s?\(?((?<=\()(?<stage>[^\)]+))?/ =~ code
+        if code2
+          code = code2.strip
+          if date2
+            if /\w+\s\d{4}/ =~ date2
+              opts[:issued_date] = Time.strptime date2, "%B %Y"
+            elsif /\w+\s\d{2},\s\d{4}/ =~ date2
+              opts[:updated_date] = Time.strptime date2, "%B %d, %Y"
+            end
+          end
+          opts[:stage] = stage if stage
+        end
+        if year.nil?
+          /^(?<code1>[^:]+):(?<year1>[^:]+)$/ =~ code
+          if code1
+            code = code1
+            year = year1
+          end
+        end
+        code += "-1" if opts[:all_parts]
+        nistbib_get1(code, year, opts)
+      end
+      private
+      # Search, filter the hits, and report on stderr when nothing matches.
+      # @return [Object, nil] the matching item, or nil
+      def nistbib_get1(code, year, opts)
+        hits = nistbib_search_filter(code, year, opts)
+        return nil unless hits
+        filtered = nistbib_results_filter(hits, year, opts)
+        filtered[:ret] || fetch_ref_err(code, year, filtered[:years])
+      end
+      # Go through the search hits, fetching their pages three at a time, and
+      # return the first fetched item that passes every requested filter:
+      # issued/updated date, draft iteration (stage) and publication year.
+      # If nothing matches, the publication years that were seen instead are
+      # returned for error reporting.
+      #
+      # @param result [Enumerable] hits to fetch (each must respond to `fetch`)
+      # @param year [String, nil] required publication year
+      # @param opts [Hash] options
+      # @option opts [Time] :issued_date required "issued" date
+      # @option opts [Time] :updated_date required "published" date; only
+      #   checked when :issued_date is absent
+      # @option opts [String] :stage stage code; its third-from-last character
+      #   selects the iteration ("I" => 1, "F" => "final", otherwise a number)
+      #
+      # @return [Hash] `{ ret: item }` on success, `{ years: [...] }` otherwise
+      def nistbib_results_filter(result, year, opts)
+        missed_years = []
+        result.each_slice(3) do |s| # batches of 3, matching the worker count
+          fetch_pages(s, 3).each do |r|
+            # `next` has to run in this block: inside a nested `each` it would
+            # only skip one date and the hit would never be rejected.
+            if opts[:issued_date]
+              next unless r.dates.any? { |d| d.type == "issued" && d.on == opts[:issued_date] }
+            elsif opts[:updated_date]
+              next unless r.dates.any? { |d| d.type == "published" && d.on == opts[:updated_date] }
+            end
+            if opts[:stage]
+              iter = opts[:stage][-3]
+              iteration = case iter
+                          when "I" then 1
+                          when "F" then "final"
+                          else iter.to_i
+                          end
+              next if iter && r.status.iteration != iteration
+            end
+            return { ret: r } if !year
+            r.dates.select { |d| d.type == "published" }.each do |d|
+              return { ret: r } if year.to_i == d.on.year
+              missed_years << d.on.year
+            end
+          end
+        end
+        { years: missed_years }
+      end
+      # Fetch the given hits through a RelatonBib::WorkersPool of `n` workers
+      # and return the fetched results in the order of the hits.
+      # @param s [Array] hits; each must respond to `fetch`
+      # @param n [Integer] pool size
+      # @return [Array] fetch results, ordered like `s`
+      def fetch_pages(s, n)
+        pool = RelatonBib::WorkersPool.new n
+        pool.worker { |job| { i: job[:i], hit: job[:hit].fetch } }
+        s.each_with_index { |hit, idx| pool << { i: idx, hit: hit } }
+        pool.end
+        # results may come back in any order; restore the submission order
+        pool.result.sort_by { |res| res[:i] }.map { |res| res[:hit] }
+      end
+      # Search for `code` and keep only the hits whose code contains the
+      # numeric part of the reference and, when the reference names a series
+      # (FIPS, SP or NISTIR followed by whitespace), whose series matches it.
+      #
+      # @param code [String] reference, e.g. "SP 800-53"
+      # @param year [String, nil] passed through to `search`
+      # @param opts [Hash] passed through to `search`
+      # @return [Array] the matching hits
+      def nistbib_search_filter(code, year, opts)
+        docid = code.match(%r{[0-9-]{3,}}).to_s
+        # "FIPS", spelled the same way as in Scrapper.parse_page
+        serie = code.match(%r{(FIPS|SP|NISTIR)(?=\s)})
+        warn "fetching #{code}..."
+        result = search(code, year, opts)
+        result.select do |i|
+          i.hit[:code]&.include?(docid) && (!serie || i.hit[:serie] == serie.to_s)
+        end
+      end
+      # Print warnings explaining why a reference could not be resolved.
+      # @param code [String]
+      # @param year [String, nil]
+      # @param missed_years [Array] years that were found instead of `year`
+      # @return [nil]
+      def fetch_ref_err(code, year, missed_years)
+        id = if year then "#{code}:#{year}" else code end
+        warn "WARNING: no match found online for #{id}. "\
+          "The code must be exactly like it is on the standards website."
+        unless missed_years.empty?
+          warn "(There was no match for #{year}, though there were matches "\
+            "found for #{missed_years.join(', ')}.)"
+        end
+        warn "The provided document part may not exist, or the document "\
+          "may no longer be published in parts." if /\d-\d/ =~ code
+        nil
+      end
+    end
+  end
+end

data/lib/relaton_nist/scrapper.rb ADDED Viewed

@@ -0,0 +1,329 @@
+require "relaton_bib"
+module RelatonNist
+  class Scrapper
+    class << self
+      DOMAIN = "https://csrc.nist.gov".freeze
+      # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+      # Fetch the page of a hit and turn it into a bibliographic item.
+      # @param hit_data [Hash] hit attributes; :url, :title, :release_date,
+      #   :code and :status are read
+      # @return [RelatonNist::NistBibliographicItem]
+      def parse_page(hit_data)
+        page = get_page hit_data[:url]
+        ids = fetch_docid(page)
+        kind = "standard"
+        title_list = fetch_titles(hit_data)
+        if !/^(SP|NISTIR|FIPS) /.match(ids[0].id)
+          # not one of the numbered series: the cleaned-up heading becomes the
+          # doctype and the title is used as identifier
+          kind = id_cleanup(ids[0].id)
+          ids[0] = RelatonBib::DocumentIdentifier.new(id: title_list[0][:content], type: "NIST")
+        end
+        attrs = {
+          fetched: Date.today.to_s,
+          type: "standard",
+          titles: title_list,
+          link: fetch_link(page),
+          docid: ids,
+          dates: fetch_dates(page, hit_data[:release_date]),
+          contributors: fetch_contributors(page),
+          edition: fetch_edition(hit_data[:code]),
+          language: ["en"],
+          script: ["Latn"],
+          abstract: fetch_abstract(page),
+          docstatus: fetch_status(page, hit_data[:status]),
+          copyright: fetch_copyright(page),
+          relations: fetch_relations(page),
+          series: fetch_series(page),
+          keyword: fetch_keywords(page),
+          commentperiod: fetch_commentperiod(page),
+          doctype: kind,
+        }
+        NistBibliographicItem.new(**attrs)
+      end
+      # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
+      # Remove the status marker from a document identifier.
+      # @param id [String]
+      # @return [String] id without " (WITHDRAWN)" and without a
+      #   " (DRAFT)" / " (<word> DRAFT)" marker (draft is case-insensitive)
+      def id_cleanup(id)
+        without_withdrawn = id.sub(/ \(WITHDRAWN\)/, "")
+        without_withdrawn.sub(/ \(([^) ]+ )?DRAFT\)/i, "")
+      end
+      private
+      # Download a page and parse it as HTML.
+      # @param url [String] address of the page
+      # @return [Nokogiri::HTML::Document]
+      def get_page(url)
+        response = Net::HTTP.get_response(URI(url))
+        Nokogiri::HTML(response.body)
+      end
+      # Fetch docid from the heading of the publication details.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Array<RelatonBib::DocumentIdentifier>] a single identifier;
+      #   its id is "?" when the page has no such heading
+      def fetch_docid(doc)
+        heading = doc.at("//div[contains(@class, 'publications-detail')]/h3")
+        # safe navigation: calling text on a missing node would raise before
+        # the "?" fallback below could ever be reached
+        item_ref = heading&.text&.strip
+        return [RelatonBib::DocumentIdentifier.new(type: "NIST", id: "?")] unless item_ref
+        [RelatonBib::DocumentIdentifier.new(id: item_ref, type: "NIST")]
+      end
+      # Fetch id.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [String]
+      # def fetch_id(doc)
+      #   doc.at("//div[contains(@class, 'publications-detail')]/h3").text.
+      #     strip.gsub(/\s/, "")
+      # end
+      # Fetch status.
+      # Maps the status text of the hit to a stage/substage pair and, for
+      # draft stages, derives the iteration from the publication history.
+      # @param doc [Nokogiri::HTML::Document]
+      # @param status [String]
+      # @return [RelatonNist::DocumentStatus]
+      def fetch_status(doc, status)
+        stage, subst = case status
+                       when "draft (withdrawn)" then ["draft-public", "withdrawn"]
+                       when "retired draft" then ["draft-public", "retired"]
+                       when "withdrawn" then ["final", "withdrawn"]
+                       when "draft" then ["draft-public", "active"]
+                       else [status, "active"]
+                       end
+        iter = nil
+        if stage.include? "draft"
+          history = doc.xpath("//span[@id='pub-history-container']/a"\
+                              "|//span[@id='pub-history-container']/span")
+          # the first entry that is not a link decides the iteration:
+          # its 1-based position, or 1 when it comes first or does not exist
+          first_plain = history.find_index { |h| h.name != "a" }
+          iter = first_plain&.positive? ? first_plain + 1 : 1
+        end
+        RelatonNist::DocumentStatus.new stage: stage, substage: subst, iteration: iter
+      end
+      # Fetch titles.
+      # @param hit_data [Hash] hit attributes; only :title is read
+      # @return [Array<Hash>] one English plain-text title
+      def fetch_titles(hit_data)
+        title = {
+          content: hit_data[:title],
+          language: "en",
+          script: "Latn",
+          format: "text/plain",
+        }
+        [title]
+      end
+      # Fetch dates.
+      # The "published" date is the release date of the hit. The "issued" date
+      # is read from the release-date span of the page, which may read
+      # "Month YYYY" or "Month DD, YYYY"; it is left out when the span is
+      # missing or its text has neither form.
+      # @param doc [Nokogiri::HTML::Document]
+      # @param release_date [#to_s] release date of the hit
+      # @return [Array<Hash>]
+      def fetch_dates(doc, release_date)
+        dates = [{ type: "published", on: release_date.to_s }]
+        text = doc.at("//span[@id='pub-release-date']")&.text&.strip
+        issued = if /(?<month_year>\w+\s\d{4})/ =~ text
+                   Date.strptime(month_year, "%B %Y")
+                 elsif /(?<full_date>\w+\s\d{1,2},\s\d{4})/ =~ text
+                   Date.strptime(full_date, "%B %d, %Y")
+                 end
+        # an unparsed date would otherwise be emitted with an empty `on`
+        dates << { type: "issued", on: issued.to_s } if issued
+        dates
+      end
+      # Fetch contributors: NIST as publisher, followed by the authors and
+      # the editors listed on the page.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Array<RelatonBib::ContributionInfo>]
+      def fetch_contributors(doc)
+        nist = RelatonBib::Organization.new(
+          name: "National Institute of Standards and Technology",
+          url: "www.nist.gov", abbreviation: "NIST",
+        )
+        publisher = RelatonBib::ContributionInfo.new(entity: nist, role: ["publisher"])
+        author_node = doc.at('//h4[.="Author(s)"]/following-sibling::p')
+        with_authors = [publisher] + contributors(author_node, "author")
+        editor_node = doc.at('//h4[.="Editor(s)"]/following-sibling::p')
+        with_authors + contributors(editor_node, "editor")
+      end
+      # rubocop:disable Metrics/CyclomaticComplexity
+      # Turn a comma-separated list of "Name (Affiliation)" entries into
+      # contributions. An entry counts as a person when it has an
+      # affiliation, a name of two or three words, and no "task", "force" or
+      # "group" in the name; any other entry becomes an organization.
+      # @param doc [Nokogiri::XML::Node, nil] node holding the list
+      # @param role [String] role given to every contributor
+      # @return [Array<RelatonBib::ContributionInfo>]
+      def contributors(doc, role)
+        return [] if doc.nil?
+        doc.text.split(", ").map do |contr|
+          /(?<an>.+?)(\s+\((?<abbrev>.+?)\))?$/ =~ contr
+          person = abbrev && an.downcase !~ /(task|force|group)/ && an.split.size.between?(2, 3)
+          entity = if person
+                     fullname = RelatonBib::FullName.new(
+                       completename: RelatonBib::LocalizedString.new(an, "en", "Latn"),
+                     )
+                     org_name, url = case abbrev
+                                     when "NIST"
+                                       ["National Institute of Standards and Technology", "www.nist.gov"]
+                                     when "MITRE" then [abbrev, "www.mitre.org"]
+                                     else [abbrev, nil]
+                                     end
+                     employer = RelatonBib::Organization.new name: org_name, url: url, abbreviation: abbrev
+                     RelatonBib::Person.new(
+                       name: fullname, affiliation: [RelatonBib::Affilation.new(employer)], contacts: [],
+                     )
+                   else
+                     RelatonBib::Organization.new name: an, abbreviation: abbrev
+                   end
+          RelatonBib::ContributionInfo.new entity: entity, role: [role]
+        end
+      end
+      # rubocop:enable Metrics/CyclomaticComplexity
+      # Derive the edition from a "Rev. N" marker in the code.
+      # @param code [String, nil]
+      # @return [String, nil] "Revision N", or nil without such a marker
+      def fetch_edition(code)
+        found = /(?<=Rev\.\s)(?<rev>\d+)/.match(code)
+        found && "Revision #{found[:rev]}"
+      end
+      # Fetch abstracts.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Array<Hash>] one English plain-text abstract built from the
+      #   paragraphs of the abstract callout
+      def fetch_abstract(doc)
+        paragraphs = doc.xpath('//div[contains(@class, "pub-abstract-callout")]/div[1]/p')
+        [{ content: paragraphs.text, language: "en", script: "Latn", format: "text/plain" }]
+      end
+      # Fetch copyright.
+      # The owner is always NIST; the year is the first four-digit number in
+      # the release-date span.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Hash]
+      def fetch_copyright(doc)
+        released = doc.at("//span[@id='pub-release-date']").text.strip
+        owner = {
+          name: "National Institute of Standards and Technology",
+          abbreviation: "NIST",
+          url: "www.nist.gov",
+        }
+        { owner: owner, from: released[/\d{4}/].to_s }
+      end
+      # Fetch links.
+      # Collects the "Local Download" (pdf) and "(DOI)" anchors that follow
+      # the "Publication:" label.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Array<Hash>] empty when the page has no "Publication:" label
+      def fetch_link(doc)
+        pub = doc.at "//p/strong[.='Publication:']"
+        return [] unless pub # nothing to anchor the lookups on
+        links = []
+        pdf = pub.at "./following-sibling::a[.=' Local Download']"
+        links << { type: "pdf", content: pdf[:href] } if pdf
+        doi = pub.at("./following-sibling::a[contains(.,'(DOI)')]")
+        links << { type: "doi", content: doi[:href] } if doi
+        links
+      end
+      # Fetch relations.
+      # Links in the "supersedes", "part" and "related" containers become
+      # "supersedes", "partOf" and "updates" relations, in that order.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Array<RelatonBib::DocumentRelation>]
+      def fetch_relations(doc)
+        containers = {
+          "supersedes" => '//span[@id="pub-supersedes-container"]/a',
+          "partOf" => '//span[@id="pub-part-container"]/a',
+          "updates" => '//span[@id="pub-related-container"]/a',
+        }
+        containers.flat_map do |type, xpath|
+          doc.xpath(xpath).map { |r| doc_relation type, r }
+        end
+      end
+      # Build a relation to the document a link points to.
+      # @param type [String] relation type
+      # @param ref [Nokogiri::XML::Node] anchor; its text and href are used
+      # @return [RelatonBib::DocumentRelation]
+      def doc_relation(type, ref)
+        fref = RelatonBib::FormattedRef.new(
+          content: ref.text, language: "en", script: "Latn", format: "text/plain",
+        )
+        uri = RelatonBib::TypedUri.new(type: "src", content: DOMAIN + ref[:href])
+        item = RelatonBib::BibliographicItem.new(formattedref: fref, link: [uri])
+        RelatonBib::DocumentRelation.new(type: type, bibitem: item)
+      end
+      # Build one series entry per link in the publication history; plain
+      # <span> entries are skipped. Only links whose text starts with the word
+      # "Draft" get a reference text: the rest of the text followed by
+      # "(IPD)" for the first history entry or "(<position>PD)" otherwise.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Array<RelatonBib::Series>]
+      def fetch_series(doc)
+        history = doc.xpath "//span[@id='pub-history-container']/a"\
+          "|//span[@id='pub-history-container']/span"
+        links = history.each_with_index.reject { |node, _idx| node.name == "span" }
+        links.map do |node, idx|
+          iter = idx.zero? ? "I" : idx + 1
+          content = node.text.match(/^[^\(]+/).to_s.strip.gsub "  ", " "
+          ref = if content.match(/\w+/).to_s == "Draft"
+                  content.match(/(?<=Draft\s).+/).to_s + " (#{iter}PD)"
+                end
+          fref = RelatonBib::FormattedRef.new(
+            content: ref, language: "en", script: "Latn", format: "text/plain",
+          )
+          RelatonBib::Series.new(formattedref: fref)
+        end
+      end
+      # Fetch keywords.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Array<RelatonNist::Keyword>] one per span in the keywords
+      #   container
+      def fetch_keywords(doc)
+        doc.xpath("//span[@id='pub-keywords-container']/span").map do |span|
+          Keyword.new(span.text)
+        end
+      end
+      # Fetch the comment period.
+      # It runs from the release date to the "comments due" date; an extended
+      # closing date is added when the page announces one.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [RelatonNist::CommentPeriod, nil] nil when the page has no
+      #   "comments due" span
+      def fetch_commentperiod(doc)
+        cp = doc.at "//span[@id='pub-comments-due']"
+        return unless cp
+        to = Date.strptime cp.text.strip, "%B %d, %Y"
+        d = doc.at("//span[@id='pub-release-date']").text.strip
+        # Like fetch_dates, accept "Month DD, YYYY" as well as "Month YYYY";
+        # the month-and-year format alone cannot read a full date correctly.
+        start = if /(?<full_date>\w+\s\d{1,2},\s\d{4})/ =~ d
+                  Date.strptime(full_date, "%B %d, %Y")
+                else
+                  Date.strptime(d, "%B %Y")
+                end
+        from = start.to_s
+        ex = doc.at "//strong[contains(.,'The comment closing date has been extended to')]"
+        ext = ex&.text&.match(/\w+\s\d{2},\s\d{4}/).to_s
+        extended = ext.empty? ? nil : Date.strptime(ext, "%B %d, %Y")
+        CommentPeriod.new from, to, extended
+      end
+    end
+  end
+end

data/lib/relaton_nist/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module RelatonNist
+  # Version of the relaton-nist gem; the gemspec reads it as spec.version.
+  VERSION = "0.1.0".freeze
+end

data/lib/relaton_nist/xml_parser.rb ADDED Viewed

@@ -0,0 +1,47 @@
+module RelatonNist
+  class XMLParser < RelatonBib::XMLParser
+    class << self
+      # Build a NIST bibliographic item from XML whose root element is
+      # <bibitem> or <bibdata>.
+      # @param xml [String]
+      # @return [RelatonNist::NistBibliographicItem]
+      def from_xml(xml)
+        root = Nokogiri::XML(xml).at("/bibitem|/bibdata")
+        NistBibliographicItem.new(item_data(root))
+      end
+      private
+      # Extend the data collected by RelatonBib::XMLParser with the keywords
+      # and the comment period found in the <ext> element, if there is one.
+      # @param nistitem [Nokogiri::XML::Element]
+      # @return [Hash]
+      def item_data(nistitem)
+        data = super
+        ext = nistitem.at "./ext"
+        if ext
+          data[:keyword] = fetch_keyword(ext)
+          data[:commentperiod] = fetch_commentperiod(ext)
+        end
+        data
+      end
+      # Read stage, substage and iteration from the <status> element.
+      # @param item [Nokogiri::XML::Element]
+      # @return [RelatonNist::DocumentStatus, nil] nil without a <status>
+      def fetch_status(item)
+        status = item.at "./status"
+        return if status.nil?
+        stage = status.at("stage")&.text
+        substage = status.at("substage")&.text
+        iteration = status.at("iteration")&.text
+        DocumentStatus.new(stage: stage, substage: substage, iteration: iteration)
+      end
+      # Read the comment period from the <commentperiod> element.
+      # @param item [Nokogiri::XML::Element]
+      # @return [RelatonNist::CommentPeriod, nil] nil without the element
+      def fetch_commentperiod(item)
+        cp = item.at "./commentperiod"
+        return if cp.nil?
+        from = cp.at("from").text
+        to = cp.at("to")&.text
+        extended = cp.at("extended")&.text
+        CommentPeriod.new from, to, extended
+      end
+      # Read the keywords; each one is built from the XML of the first child
+      # node of its <keyword> element.
+      # @param item [Nokogiri::XML::Element]
+      # @return [Array<RelatonNist::Keyword>]
+      def fetch_keyword(item)
+        keyword_nodes = item.xpath("./keyword")
+        keyword_nodes.map { |node| Keyword.new(node.children.first.to_xml) }
+      end
+    end
+  end
+end

data/lib/relaton_nist.rb ADDED Viewed

@@ -0,0 +1,12 @@
+require "relaton_nist/version"
+require "relaton_nist/nist_bibliography"
+# Register the NIST processor with Relaton's registry, but only when the
+# Relaton constant is already defined at the time this file is loaded.
+if defined? Relaton
+  require_relative "relaton/processor"
+  Relaton::Registry.instance.register(Relaton::RelatonNist::Processor)
+end
+module RelatonNist
+  # Error class declared in the RelatonNist namespace.
+  class Error < StandardError; end
+end

data/relaton_nist.gemspec ADDED Viewed

@@ -0,0 +1,39 @@
+lib = File.expand_path("../lib", __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require "relaton_nist/version"
+Gem::Specification.new do |spec|
+  spec.name          = "relaton-nist"
+  spec.version       = RelatonNist::VERSION
+  spec.authors       = ["Ribose Inc."]
+  spec.email         = ["open.source@ribose.com"]
+  # summary/description are published with the gem, hence the spelling fix
+  spec.summary       = "RelatonNist: retrieve NIST standards."
+  spec.description   = "RelatonNist: retrieve NIST standards."
+  spec.homepage      = "https://github.com/metanorma/relaton-nist"
+  spec.license       = "MIT"
+  # Specify which files should be added to the gem when it is released.
+  # `git ls-files -z` lists the files tracked by git; test, spec and
+  # features directories are left out of the gem.
+  spec.files         = Dir.chdir(File.expand_path(__dir__)) do
+    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  end
+  spec.bindir        = "exe"
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+  spec.required_ruby_version = Gem::Requirement.new(">= 2.4.0")
+  spec.add_development_dependency "bundler", "~> 2.0"
+  spec.add_development_dependency "byebug"
+  spec.add_development_dependency "debase"
+  spec.add_development_dependency "equivalent-xml", "~> 0.6"
+  spec.add_development_dependency "pry-byebug"
+  spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_development_dependency "rspec", "~> 3.0"
+  spec.add_development_dependency "ruby-debug-ide"
+  spec.add_development_dependency "simplecov"
+  spec.add_development_dependency "vcr"
+  spec.add_development_dependency "webmock"
+  spec.add_dependency "relaton-bib", "~> 0.1.6"
+end