RubyGems - relaton-gb - Versions diffs - 0.1.0 - Mend

relaton-gb 0.1.0

Files changed (31) hide show

checksums.yaml +7 -0
data/.gitignore +13 -0
data/.hound.yml +3 -0
data/.rspec +3 -0
data/.rubocop.yml +10 -0
data/.travis.yml +17 -0
data/Gemfile +9 -0
data/Gemfile.lock +84 -0
data/LICENSE.txt +25 -0
data/README.adoc +202 -0
data/Rakefile +6 -0
data/appveyor.yml +35 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/lib/relaton/processor.rb +25 -0
data/lib/relaton_gb.rb +7 -0
data/lib/relaton_gb/gb_bibliographic_item.rb +111 -0
data/lib/relaton_gb/gb_bibliography.rb +137 -0
data/lib/relaton_gb/gb_scrapper.rb +51 -0
data/lib/relaton_gb/gb_standard_type.rb +30 -0
data/lib/relaton_gb/gb_technical_committee.rb +23 -0
data/lib/relaton_gb/hit.rb +60 -0
data/lib/relaton_gb/hit_collection.rb +45 -0
data/lib/relaton_gb/scrapper.rb +197 -0
data/lib/relaton_gb/sec_scrapper.rb +57 -0
data/lib/relaton_gb/t_scrapper.rb +121 -0
data/lib/relaton_gb/version.rb +5 -0
data/lib/relaton_gb/xml_parser.rb +49 -0
data/lib/relaton_gb/yaml/prefixes.yaml +197 -0
data/relaton_gb.gemspec +39 -0
metadata +229 -0

data/lib/relaton_gb/hit_collection.rb ADDED

@@ -0,0 +1,45 @@
+# frozen_string_literal: true
+module RelatonGb
+  # Page of hit collection
+  class HitCollection < Array
+    # @return [TrueClass, FalseClass]
+    attr_reader :fetched
+    # @return [Isobib::HitPages]
+    attr_reader :hit_pages
+    # @return [RelatonGb::GbScrapper, RelatonGb::SecScrapper, RelatonGb::TScrapper]
+    attr_reader :scrapper
+    # @param hits [Array<Hash>]
+    # @param hit_pages [Integer]
+    # @param scrapper [RelatonGb::GbScrapper, RelatonGb::SecScrapper, RelatonGb::TScrapper]
+    def initialize(hits = [], hit_pages = nil)
+      concat hits
+      @fetched   = false
+      @hit_pages = hit_pages
+    end
+    # @return [RelatonGb::HitCollection]
+    # def fetch
+    #   workers = RelatonBib::WorkersPool.new 4
+    #   workers.worker(&:fetch)
+    #   each do |hit|
+    #     workers << hit
+    #   end
+    #   workers.end
+    #   workers.result
+    #   @fetched = true
+    #   self
+    # end
+    def to_s
+      inspect
+    end
+    def inspect
+      "<#{self.class}:#{format('%#.14x', object_id << 1)} @fetched=#{@fetched}>"
+    end
+  end
+end

data/lib/relaton_gb/scrapper.rb ADDED

@@ -0,0 +1,197 @@
+# encoding: UTF-8
+# frozen_string_literal: true
+require "yaml"
+require "gb_agencies"
+module RelatonGb
+  # Common scrapping methods.
+  module Scrapper
+    @prefixes = nil
+    # rubocop:disable Metrics/MethodLength
+    # @param doc [Nokogiri::HTML::Document]
+    # @param src [String] url of scrapped page
+    # @return [Hash]
+    def scrapped_data(doc, src:)
+      {
+        committee: get_committee(doc),
+        docid: get_docid(doc),
+        titles: get_titles(doc),
+        contributors: get_contributors(doc),
+        type: get_type(doc),
+        docstatus: get_status(doc),
+        gbtype: get_gbtype(doc),
+        ccs: get_ccs(doc),
+        ics: get_ics(doc),
+        link: [{ type: "src", content: src }],
+        dates: get_dates(doc),
+        language: ["zh"],
+        script: ["Hans"],
+        structuredidentifier: fetch_structuredidentifier(doc),
+      }
+    end
+    # rubocop:enable Metrics/MethodLength
+    # @param doc [Nokogiri::HTML::Document]
+    # @param xpt [String]
+    # @return [Array<RelatonBib::DocumentIdentifier>]
+    def get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
+      item_ref = doc.at xpt
+      return [] unless item_ref
+      [RelatonBib::DocumentIdentifier.new(id: item_ref.text, type: "Chinese Standard")]
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @param xpt [String]
+    # @return [RelatonIsoBib::StructuredIdentifier]
+    def fetch_structuredidentifier(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
+      item_ref = doc.at xpt
+      unless item_ref
+        return RelatonIsoBib::StructuredIdentifier.new(
+          project_number: "?", part_number: "?", prefix: nil, id: "?",
+          type: "Chinese Standard"
+        )
+      end
+      m = item_ref.text.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
+      # prefix = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
+      RelatonIsoBib::StructuredIdentifier.new(
+        project_number: m[1], part_number: m[2], prefix: nil,
+        id: item_ref.text, type: "Chinese Standard"
+      )
+    end
+    def get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
+      gb_en = GbAgencies::Agencies.new("en", {}, "")
+      gb_zh = GbAgencies::Agencies.new("zh", {}, "")
+      name = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
+      name.sub!(%r{/[TZ]$}, "") unless name =~ /^GB/
+      gbtype = get_gbtype(doc)
+      entity = RelatonBib::Organization.new name: [
+        { language: "en", content: gb_en.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
+        { language: "zh", content: gb_zh.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
+      ]
+      [{ entity: entity, roles: ["publisher"] }]
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Array<Hash>]
+    #   * :title_intro [String]
+    #   * :title_main [String]
+    #   * :language [String]
+    #   * :script [String]
+    def get_titles(doc)
+      titles = [{ title_main: doc.css("div.page-header h4").text, title_intro: nil,
+                  language: "zh", script: "Hans" }]
+      title_main = doc.css("div.page-header h5").text
+      unless title_main.empty?
+        titles << { title_main: title_main, title_intro: nil, language: "en", script: "Latn" }
+      end
+      titles
+    end
+    def get_type(_doc)
+      "international-standard"
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @param xpt [String]
+    # @return [RelatonBib::DocumentStatus]
+    def get_status(doc, xpt = ".s-status.label:nth-child(3)")
+      case doc.at(xpt).text.gsub(/\s/, "")
+      when "即将实施"
+        stage = "published"
+      when "现行"
+        stage = "activated"
+      when "废止"
+        stage = "obsoleted"
+      end
+      RelatonBib::DocumentStatus.new stage: stage
+    end
+    private
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Hash]
+    #   * :scope [String]
+    #   * :prefix [String]
+    #   * :mandate [String]
+    def get_gbtype(doc)
+      ref = get_ref(doc)
+      { scope: get_scope(doc), prefix: get_prefix(ref)["prefix"],
+        mandate: get_mandate(ref) }
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [String]
+    def get_ref(doc)
+      doc.xpath('//dt[text()="标准号"]/following-sibling::dd[1]').text
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Array<String>]
+    def get_ccs(doc)
+      [doc&.xpath('//dt[text()="中国标准分类号"]/following-sibling::dd[1]')&.text]
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Array<Hash>]
+    #   * :field [String]
+    #   * :group [String]
+    #   * :subgroup [String]
+    def get_ics(doc)
+      ics = doc.xpath('//dt[(.="国际标准分类号")]/following-sibling::dd[1]/span')
+      return [] if ics.empty?
+      field, group, subgroup = ics.text.split "."
+      [{ field: field, group: group.ljust(3, "0"), subgroup: subgroup }]
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [String]
+    def get_scope(doc)
+      scope = doc.at(".s-status.label-info").text
+      if scope == "国家标准"
+        "national"
+      elsif scope =~ /^行业标准/
+        "sector"
+      end
+    end
+    # @param ref [String]
+    # @return [String]
+    def get_prefix(ref)
+      pref = ref.match(/^[^\s]+/).to_s.split("/").first
+      prefix pref
+    end
+    # @param pref [String]
+    # @return [Hash{String=>String}]
+    def prefix(pref)
+      file_path = File.join(__dir__, "yaml/prefixes.yaml")
+      @prefixes ||= YAML.load_file(file_path)
+      @prefixes[pref]
+    end
+    # @param ref [String]
+    # @return [String]
+    def get_mandate(ref)
+      case ref.match(%r{(?<=\/)[^\s]+}).to_s
+      when "T" then "recommended"
+      when "Z" then "guidelines"
+      else "mandatory"
+      end
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Array<Hash>]
+    #   * :type [String] type of date
+    #   * :on [String] date
+    def get_dates(doc)
+      date = doc.xpath('//dt[.="发布日期"]/following-sibling::dd[1]').text
+      [{ type: "published", on: date }]
+    end
+  end
+end

data/lib/relaton_gb/sec_scrapper.rb ADDED

@@ -0,0 +1,57 @@
+# encoding: UTF-8
+# frozen_string_literal: true
+require "net/http"
+require "json"
+require "nokogiri"
+require "relaton_gb/scrapper"
+require "relaton_gb/gb_bibliographic_item"
+require "relaton_gb/hit_collection"
+require "relaton_gb/hit"
+module RelatonGb
+  # Sector standard scrapper
+  module SecScrapper
+    extend Scrapper
+    class << self
+      # Searches std.gov.cn for sector standards.
+      # @param text [String] code of standard for search
+      # @return [RelatonGb::HitCollection, nil] nil (after a warning) when one
+      #   of the rescued network errors occurs
+      def scrape_page(text)
+        # Standard codes contain spaces and may contain non-ASCII characters;
+        # unescaped they make URI() raise URI::InvalidURIError.
+        query = URI.encode_www_form_component text
+        uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{query}"
+        res = JSON.parse Net::HTTP.get(uri)
+        # Array() turns a missing "rows" key into an empty result.
+        hits = Array(res["rows"]).map do |r|
+          Hit.new pid: r["id"], title: r["STD_CODE"], scrapper: self
+        end
+        HitCollection.new hits
+      rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
+             Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
+        warn "Cannot access #{uri}"
+      end
+      # Downloads and scrapes the page of one standard.
+      # @param pid [String] standard's page id
+      # @return [RelatonGb::GbBibliographicItem, nil] nil (after a warning)
+      #   when one of the rescued network errors occurs
+      def scrape_doc(pid)
+        src = "http://www.std.gov.cn/hb/search/stdHBDetailed?id=#{pid}"
+        page_uri = URI src
+        doc = Nokogiri::HTML Net::HTTP.get(page_uri)
+        GbBibliographicItem.new scrapped_data(doc, src: src)
+      rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
+             Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
+        warn "Cannot access #{src}"
+      end
+      private
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Hash]
+      #   * :type [String]
+      #   * :name [String, nil] nil when the reference's prefix is unknown
+      def get_committee(doc)
+        ref = get_ref(doc)
+        # get_prefix returns nil for a prefix missing from prefixes.yaml.
+        name = (get_prefix(ref) || {})["administration"]
+        { type: "technical", name: name }
+      end
+    end
+  end
+end

data/lib/relaton_gb/t_scrapper.rb ADDED

@@ -0,0 +1,121 @@
+# encoding: UTF-8
+# frozen_string_literal: true
+require "cgi"
+require "open-uri"
+require "nokogiri"
+require "relaton_gb/scrapper"
+require "relaton_gb/gb_bibliographic_item"
+require "relaton_gb/hit_collection"
+require "relaton_gb/hit"
+module RelatonGb
+  # Social standard scarpper.
+  module TScrapper
+    extend Scrapper
+    class << self
+      # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
+      # @param text [String]
+      # @return [RelatonGb::HitCollection]
+      def scrape_page(text)
+        search_html = OpenURI.open_uri(
+          "http://www.ttbz.org.cn/Home/Standard?searchType=2&key=" +
+          CGI.escape(text.tr("-", [8212].pack("U"))),
+        )
+        header = Nokogiri::HTML search_html
+        xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
+        t_xpath = "../preceding-sibling::td[3]"
+        hits = header.xpath(xpath).map do |h|
+          title = h.at(t_xpath).text.gsub(/â\u0080\u0094/, "-")
+          Hit.new pid: h[:href].sub(%r{\/$}, ""), title: title, scrapper: self
+        end
+        HitCollection.new hits
+      rescue OpenURI::HTTPError, SocketError
+        warn "Cannot access http://www.ttbz.org.cn/Home/Standard"
+      end
+      # rubocop:enable Metrics/MethodLength, Metrics/AbcSize
+      # @param pid [String] standard's page path
+      # @return [RelatonGb::GbBibliographicItem]
+      def scrape_doc(pid)
+        src = "http://www.ttbz.org.cn#{pid}"
+        doc = Nokogiri::HTML OpenURI.open_uri(src), nil, Encoding::UTF_8.to_s
+        GbBibliographicItem.new scrapped_data(doc, src: src)
+      rescue OpenURI::HTTPError, SocketError
+        warn "Cannot access #{src}"
+      end
+      private
+      # rubocop:disable Metrics/MethodLength
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Hash]
+      def scrapped_data(doc, src:)
+        docid_xpt  = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
+        status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
+        {
+          committee: get_committee(doc),
+          docid: get_docid(doc, docid_xpt),
+          titles: get_titles(doc),
+          type: "international-standard",
+          docstatus: get_status(doc, status_xpt),
+          gbtype: gbtype,
+          ccs: get_ccs(doc),
+          ics: get_ics(doc),
+          link: [{ type: "src", content: src }],
+          dates: get_dates(doc),
+          language: ["zh"],
+          script: ["Hans"],
+          structuredidentifier: fetch_structuredidentifier(doc),
+        }
+      end
+      # rubocop:enable Metrics/MethodLength
+      def get_committee(doc)
+        {
+          name: doc.xpath('//td[.="团体名称"]/following-sibling::td[1]').text,
+          type: "technical",
+        }
+      end
+      def get_titles(doc)
+        xpath  = '//td[contains(.,"中文标题")]/following-sibling::td[1]'
+        titles = [{ title_main: doc.xpath(xpath).text,
+                    title_intro: nil, language: "zh", script: "Hans" }]
+        xpath = '//td[contains(.,"英文标题")]/following-sibling::td[1]'
+        title_main = doc.xpath(xpath).text
+        unless title_main.empty?
+          titles << { title_main: title_main, title_intro: nil, language: "en",
+                      script: "Latn" }
+        end
+        titles
+      end
+      def gbtype
+        { scope: "social-group", prefix: "T", mandate: "mandatory" }
+      end
+      # def get_group_code(ref)
+      #   ref.match(%r{(?<=\/)[^\s]})
+      # end
+      def get_ccs(doc)
+        [doc.xpath('//td[contains(.,"中国标准分类号")]/following-sibling::td[1]')
+          .text.gsub(/[\r\n]/, "").strip.match(/^[^\s]+/).to_s]
+      end
+      def get_ics(doc)
+        xpath = '//td[contains(.,"国际标准分类号")]/following-sibling::td[1]/span'
+        ics = doc.xpath(xpath).text.match(/^[^\s]+/).to_s
+        field, group, subgroup = ics.split "."
+        [{ field: field, group: group.ljust(3, "0"), subgroup: subgroup }]
+      end
+      def get_dates(doc)
+        d = doc.xpath('//td[contains(.,"发布日期")]/following-sibling::td[1]/span')
+          .text.match(/(?<y>\d{4})[^\d]+(?<m>\d{2})[^\d]+(?<d>\d{2})/)
+        [{ type: "published", on: "#{d[:y]}-#{d[:m]}-#{d[:d]}" }]
+      end
+    end
+  end
+end

data/lib/relaton_gb/version.rb ADDED

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module RelatonGb
+  # Version of the relaton-gb gem.
+  VERSION = "0.1.0"
+end

data/lib/relaton_gb/xml_parser.rb ADDED

@@ -0,0 +1,49 @@
+require "nokogiri"
+module RelatonGb
+  class XMLParser < RelatonIsoBib::XMLParser
+    class << self
+      def from_xml(xml)
+        doc = Nokogiri::XML(xml)
+        gbitem = doc.at "/bibitem|/bibdata"
+        GbBibliographicItem.new item_data(gbitem)
+      end
+      private
+      def item_data(gbitem)
+        data = super
+        data[:committee] = fetch_committee gbitem
+        data[:gbtype] = fetch_gbtype gbitem
+        data[:ccs] = fetch_ccs gbitem
+        data[:plannumber] = gbitem.at("./plannumber")&.text
+        data
+      end
+      # Overrade get_id from RelatonIsoBib::XMLParser
+      # def get_id(did)
+      #   did.text.match(/^(?<project>.*?\d+)(?<hyphen>-)?(?(<hyphen>)(?<year>\d*))/)
+      # end
+      def fetch_committee(doc)
+        committee = doc.at "./ext/gbcommittee"
+        return nil unless committee
+        { type: committee[:type], name: committee.text }
+      end
+      def fetch_ccs(doc)
+        doc.xpath("./ext/ccs/code").map &:text
+      end
+      def fetch_gbtype(doc)
+        gbtype = doc.at "./ext/gbtype"
+        {
+          scope: gbtype&.at("gbscope")&.text,
+          prefix: gbtype&.at("gbprefix")&.text,
+          mandate: gbtype&.at("gbmandate")&.text,
+        }
+      end
+    end
+  end
+end