RubyGems - gbbib - Versions diffs - 0.1.0 - Mend

gbbib 0.1.0

Files changed (27) hide show

checksums.yaml +7 -0
data/.gitignore +11 -0
data/.rspec +3 -0
data/.rubocop.yml +5 -0
data/.travis.yml +5 -0
data/Gemfile +6 -0
data/Gemfile.lock +63 -0
data/LICENSE.txt +21 -0
data/README.adoc +39 -0
data/Rakefile +6 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/gbbib.gemspec +35 -0
data/lib/gbbib/gb_bibliographic_item.rb +58 -0
data/lib/gbbib/gb_bibliography.rb +35 -0
data/lib/gbbib/gb_scrapper.rb +50 -0
data/lib/gbbib/gb_standard_type.rb +30 -0
data/lib/gbbib/gb_technical_committee.rb +23 -0
data/lib/gbbib/hit.rb +58 -0
data/lib/gbbib/hit_collection.rb +45 -0
data/lib/gbbib/scrapper.rb +159 -0
data/lib/gbbib/sec_scrapper.rb +51 -0
data/lib/gbbib/t_scrapper.rb +116 -0
data/lib/gbbib/version.rb +3 -0
data/lib/gbbib/yaml/prefixes.yaml +197 -0
data/lib/gbbib.rb +2 -0
metadata +169 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 6e45fa975e5e4cc8f444bd95318f654d51e4cf49
+  data.tar.gz: 0a15ef16df85fb6881e255bfeb720994a3ebe8bf
+SHA512:
+  metadata.gz: 63a535f450bad7e4160441d0ef5c9bb27177d28fbee9381e9b62a69a1afe15d76adaca3349702a0056312fe40d1ae0e97401094ce11d440b599d0f7eedc94a2f
+  data.tar.gz: ca2ddb4662296000b02b20709edc4727923a453cff78b84da4e3612a67dd72cf701bde041107905203d17b2068f0447a43bc9e00791f5112211a5598c8c785f5

data/.gitignore ADDED Viewed

@@ -0,0 +1,11 @@
+/.bundle/
+/.yardoc
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+# rspec failure tracking
+.rspec_status

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--format documentation
+--color
+--require spec_helper

data/.rubocop.yml ADDED Viewed

@@ -0,0 +1,5 @@
+# RuboCop settings for the gem.
+AllCops:
+  # RuboCop takes the target as major.minor; a three-part value such as
+  # 2.4.0 is read as a string and is not a recognised Ruby version.
+  TargetRubyVersion: 2.4
+# The Style/Encoding cop is switched off for this project.
+Style/Encoding:
+  Enabled: false

data/.travis.yml ADDED Viewed

@@ -0,0 +1,5 @@
+# Travis CI build settings: a Ruby project built on a single interpreter.
+sudo: false
+language: ruby
+rvm:
+  - 2.4.0
+# Install the same Bundler release that Gemfile.lock records under
+# "BUNDLED WITH".
+before_install: gem install bundler -v 1.16.1

data/Gemfile ADDED Viewed

@@ -0,0 +1,6 @@
+# Gems are resolved from rubygems.org.
+source "https://rubygems.org"
+# Expand `github: "owner/repo"` shortcuts to HTTPS clone URLs.
+git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
+# Specify your gem's dependencies in gbbib.gemspec
+gemspec

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,63 @@
+PATH
+  remote: .
+  specs:
+    gbbib (0.1.0)
+      cnccs
+      iso-bib-item
+GEM
+  remote: https://rubygems.org/
+  specs:
+    byebug (10.0.0)
+    cnccs (0.1.1)
+    coderay (1.1.2)
+    diff-lcs (1.3)
+    docile (1.1.5)
+    iso-bib-item (0.1.4)
+      isoics (~> 0.1.6)
+      nokogiri
+    isoics (0.1.6)
+    json (2.1.0)
+    method_source (0.9.0)
+    mini_portile2 (2.3.0)
+    nokogiri (1.8.2)
+      mini_portile2 (~> 2.3.0)
+    pry (0.11.3)
+      coderay (~> 1.1.0)
+      method_source (~> 0.9.0)
+    pry-byebug (3.6.0)
+      byebug (~> 10.0)
+      pry (~> 0.10)
+    rake (10.5.0)
+    rspec (3.7.0)
+      rspec-core (~> 3.7.0)
+      rspec-expectations (~> 3.7.0)
+      rspec-mocks (~> 3.7.0)
+    rspec-core (3.7.1)
+      rspec-support (~> 3.7.0)
+    rspec-expectations (3.7.0)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.7.0)
+    rspec-mocks (3.7.0)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.7.0)
+    rspec-support (3.7.1)
+    simplecov (0.15.1)
+      docile (~> 1.1.0)
+      json (>= 1.8, < 3)
+      simplecov-html (~> 0.10.0)
+    simplecov-html (0.10.2)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler (~> 1.16)
+  gbbib!
+  pry-byebug
+  rake (~> 10.0)
+  rspec (~> 3.0)
+  simplecov
+BUNDLED WITH
+   1.16.1

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+Copyright (c) 2018 Andrei Kislichenko
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.adoc ADDED Viewed

@@ -0,0 +1,39 @@
+# Gbbib
+Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/gbbib`. To experiment with that code, run `bin/console` for an interactive prompt.
+TODO: Delete this and the text above, and describe your gem
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'gbbib'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install gbbib
+## Usage
+TODO: Write usage instructions here
+## Development
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+## Contributing
+Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/gdbib.
+## License
+The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).

data/Rakefile ADDED Viewed

@@ -0,0 +1,6 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/bin/console ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require "bundler/setup"
+require "gdbib"
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+require "irb"
+IRB.start(__FILE__)

data/bin/setup ADDED Viewed

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+bundle install
+# Do any other automated setup that you need to do here

data/gbbib.gemspec ADDED Viewed

@@ -0,0 +1,35 @@
+lib = File.expand_path('lib', __dir__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'gbbib/version'
+Gem::Specification.new do |spec|
+  spec.name          = 'gbbib'
+  spec.version       = Gbbib::VERSION
+  spec.authors       = ['Ribose Inc.']
+  spec.email         = ['pen.source@ribose.com']
+  spec.summary       = 'GdBib: retrieve Chinese GB Standards for bibliographic'\
+                       ' use using the BibliographicItem model.'
+  spec.description   = 'GdBib: retrieve Chinese GB Standards for bibliographic'\
+                       ' use using the BibliographicItem model.'
+  spec.homepage      = 'https://github.com/riboseinc/gdbib'
+  spec.license       = 'MIT'
+  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
+    f.match(%r{^(test|spec|features)/})
+  end
+  spec.bindir        = 'exe'
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ['lib']
+  spec.add_development_dependency 'bundler', '~> 1.16'
+  spec.add_development_dependency 'pry-byebug'
+  spec.add_development_dependency 'rake', '~> 10.0'
+  spec.add_development_dependency 'rspec', '~> 3.0'
+  spec.add_development_dependency 'simplecov'
+  spec.add_dependency 'cnccs'
+  spec.add_dependency 'iso-bib-item'
+  # spec.add_dependency 'nokogiri'
+end

data/lib/gbbib/gb_bibliographic_item.rb ADDED Viewed

@@ -0,0 +1,58 @@
+# frozen_string_literal: true
+require 'iso_bib_item'
+require 'cnccs'
+require 'gbbib/gb_technical_committee'
+require 'gbbib/gb_standard_type'
+module Gbbib
+  # GB bibliographic item class.
+  class GbBibliographicItem < IsoBibItem::IsoBibliographicItem
+    # @return [Gbbib::GbTechnicalCommittee]
+    attr_reader :committee
+    # @return [Gbbib::GbStandardType]
+    attr_reader :gbtype
+    # @return [String]
+    attr_reader :topic
+    # @return [Array<Cnccs::Ccs>]
+    attr_reader :ccs
+    # @return [String]
+    attr_reader :plan_number
+    # @return [String]
+    attr_reader :type
+    def initialize(**args)
+      super
+      @committee = GbTechnicalCommittee.new args[:committee]
+      @ccs = args[:ccs].map { |c| Cnccs.fetch c }
+      @gbtype = GbStandardType.new args[:gbtype]
+      @type = args[:type]
+    end
+    # @param builder [Nokogiri::XML::Builder]
+    # @return [String]
+    def to_xml(builder = nil, **opts)
+      if builder
+        super(builder, opts) { |xml| render_gbxml(xml) }
+      else
+        Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |bldr|
+          super(bldr, opts) { |xml| render_gbxml(xml) }
+        end.doc.root.to_xml
+      end
+    end
+    private
+    # @param builder [Nokogiri::XML::Builder]
+    def render_gbxml(builder)
+      committee.to_xml builder
+      gbtype.to_xml builder
+      ccs.each { |c| builder.ccs c.description } if ccs.any?
+    end
+  end
+end

data/lib/gbbib/gb_bibliography.rb ADDED Viewed

@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+# GB bib module.
+module Gbbib
+  # GB entry point class.
+  class GbBibliography
+    class << self
+      # rubocop:disable Metrics/MethodLength
+      # @param text [Strin] code of standard for search
+      # @return [Gbbib::Hits]
+      def search(text)
+        if text.match?(/^(GB|GJ|GS)/)
+          # Scrape national standards.
+          require 'gbbib/gb_scrapper'
+          GbScrapper.scrape_page text
+        elsif text.match?(/^ZB/)
+          # Scrape proffesional.
+        elsif text.match?(/^DB/)
+          # Scrape local standard.
+        elsif text.match? %r{^Q\/}
+          # Enterprise standard
+        elsif text.match? %r{^T\/[^\s]{3,6}\s}
+          # Scrape social standard.
+          require 'gbbib/t_scrapper'
+          TScrapper.scrape_page text
+        else
+          # Scrape sector standard.
+          require 'gbbib/sec_scrapper'
+          SecScrapper.scrape_page text
+        end
+      end
+      # rubocop:enable Metrics/MethodLength
+    end
+  end
+end

data/lib/gbbib/gb_scrapper.rb ADDED Viewed

@@ -0,0 +1,50 @@
+# encoding: UTF-8
+# frozen_string_literal: true
+require 'open-uri'
+require 'nokogiri'
+require 'gbbib/scrapper'
+require 'gbbib/gb_bibliographic_item'
+require 'gbbib/gb_standard_type'
+require 'gbbib/hit_collection'
+require 'gbbib/hit'
+module Gbbib
+  # National standard scrapper.
+  module GbScrapper
+    extend Scrapper
+    class << self
+      # @param text [Strin] code of standard for serarch
+      # @return [Gbbib::HitCollection]
+      def scrape_page(text)
+        search_html = OpenURI.open_uri(
+          'http://www.std.gov.cn/search/stdPage?q=' + text
+        )
+        result = Nokogiri::HTML search_html
+        hits = result.css('.s-title a').map do |h|
+          Hit.new pid: h[:pid], title: h.text, scrapper: self
+        end
+        HitCollection.new hits
+      end
+      # @param pid [Strin] standard's page id
+      # @return [Gbbib::GbBibliographicItem]
+      def scrape_doc(pid)
+        src = 'http://www.std.gov.cn/gb/search/gbDetailed?id=' + pid
+        doc = Nokogiri::HTML OpenURI.open_uri(src)
+        GbBibliographicItem.new scrapped_data(doc, src: src)
+      end
+      # @param doc [Nokogiri::HTML]
+      # @return [Hash]
+      #   * :type [String]
+      #   * :name [String]
+      def get_committee(doc)
+        name = doc.xpath('//p/a[1]/following-sibling::text()').text
+                  .match(/(?<=（)[^）]+/).to_s
+        { type: 'technical', name: name }
+      end
+    end
+  end
+end

data/lib/gbbib/gb_standard_type.rb ADDED Viewed

@@ -0,0 +1,30 @@
+module Gbbib
+  # GB standard type.
+  class GbStandardType
+    # @return [String]
+    attr_reader :scope
+    # @return [String]
+    attr_reader :prefix
+    # @return [String]
+    attr_reader :mandate
+    # @param scope [String]
+    # @param prefix [String]
+    # @param mandate [String]
+    def initialize(scope:, prefix:, mandate:)
+      @scope   = scope
+      @prefix  = prefix
+      @mandate = mandate
+    end
+    def to_xml(builder)
+      builder.gbtype do
+        builder.gbscope @scope
+        builder.gbprefix @prefix
+        builder.gbmandate @mandate
+      end
+    end
+  end
+end

data/lib/gbbib/gb_technical_committee.rb ADDED Viewed

@@ -0,0 +1,23 @@
+module Gbbib
+  # GB technical committee.
+  class GbTechnicalCommittee
+    # @return [String]
+    attr_reader :type
+    # @return [String]
+    attr_reader :name
+    # @param type [String]
+    # @param name [String]
+    def initialize(type:, name:)
+      @type = type
+      @name = name
+    end
+    def to_xml(builder)
+      builder.gbcommittee(type: @type) do
+        builder.text @name
+      end
+    end
+  end
+end

data/lib/gbbib/hit.rb ADDED Viewed

@@ -0,0 +1,58 @@
+# frozen_string_literal: true
+module Gbbib
+  # Hit.
+  class Hit
+    # @return [Isobib::HitCollection]
+    attr_reader :hit_collection
+    # @return [String]
+    attr_reader :pid
+    # @return [String]
+    attr_reader :title
+    # @return [Gbbib::GbScrapper, Gbbib::SecScraper, Gbbib::TScrapper]
+    attr_reader :scrapper
+    # @param hit [Hash]
+    # @param hit_collection [Isobib:HitCollection]
+    def initialize(pid:, title:, hit_collection: nil, scrapper:)
+      @pid            = pid
+      @title          = title
+      @hit_collection = hit_collection
+      @scrapper       = scrapper
+      self.hit_collection << self if hit_collection
+    end
+    # Parse page.
+    # @return [Isobib::IsoBibliographicItem]
+    def fetch
+      @fetch ||= scrapper.scrape_doc pid
+    end
+    # @return [String]
+    def to_s
+      inspect
+    end
+    # @return [String]
+    def inspect
+      "<#{self.class}:#{format('%#.14x', object_id << 1)} "\
+      "@fullIdentifier=\"#{@fetch&.shortref}\" "\
+      "@title=\"#{title}\">"
+    end
+    # @return [String]
+    def to_xml(builder = nil, opts = {})
+      if builder
+        fetch.to_xml builder, opts
+      else
+        builder = Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
+          fetch.to_xml xml, opts
+        end
+        builder.doc.root.to_xml
+      end
+    end
+  end
+end

data/lib/gbbib/hit_collection.rb ADDED Viewed

@@ -0,0 +1,45 @@
+# frozen_string_literal: true
+module Gbbib
+  # Page of hit collection
+  class HitCollection < Array
+    # @return [TrueClass, FalseClass]
+    attr_reader :fetched
+    # @return [Isobib::HitPages]
+    attr_reader :hit_pages
+    # @return [Gbbib::GbScrapper, Gbbib::SecScrapper, Gbbib::TScrapper]
+    attr_reader :scrapper
+    # @param hits [Array<Hash>]
+    # @param hit_pages [Integer]
+    # @param scrapper [Gbbib::GbScrapper, Gbbib::SecScrapper, Gbbib::TScrapper]
+    def initialize(hits = nil, hit_pages = nil)
+      concat hits
+      @fetched   = false
+      @hit_pages = hit_pages
+    end
+    # @return [Isobib::HitCollection]
+    # def fetch
+    #   workers = WorkersPool.new 4
+    #   workers.worker(&:fetch)
+    #   each do |hit|
+    #     workers << hit
+    #   end
+    #   workers.end
+    #   workers.result
+    #   @fetched = true
+    #   self
+    # end
+    # def to_s
+    #   inspect
+    # end
+    #
+    # def inspect
+    # "<#{self.class}:#{format('%#.14x', object_id << 1)} @fetched=#{@fetched}>"
+    # end
+  end
+end

data/lib/gbbib/scrapper.rb ADDED Viewed

@@ -0,0 +1,159 @@
+# encoding: UTF-8
+# frozen_string_literal: true
+require 'yaml'
+module Gbbib
+  # Common scrapping methods.
+  module Scrapper
+    @prefixes = nil
+    # rubocop:disable Metrics/MethodLength
+    # @param doc [Nokogiri::HTML::Document]
+    # @param src [String] url of scrapped page
+    # @return [Hash]
+    def scrapped_data(doc, src:)
+      {
+        committee: get_committee(doc),
+        docid:     get_docid(doc),
+        titles:    get_titles(doc),
+        type:      get_type(doc),
+        docstatus: get_status(doc),
+        gbtype:    get_gbtype(doc),
+        ccs:       get_ccs(doc),
+        ics:       get_ics(doc),
+        source:    [{ type: 'src', content: src }],
+        dates:     get_dates(doc),
+        language:  ['zh'],
+        script:    ['Hans']
+      }
+    end
+    # rubocop:enable Metrics/MethodLength
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Hash]
+    #   * :project_number [String]
+    #   * :part_number [String]
+    def get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
+      item_ref = doc.xpath(xpt)
+                    .text.match(/(?<=\s)(\d+)-?((?<=-)\d+|)/)
+      { project_number: item_ref[1], part_number: item_ref[2] }
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Array<Hash>]
+    #   * :title_intro [String]
+    #   * :title_main [String]
+    #   * :language [String]
+    #   * :script [String]
+    def get_titles(doc)
+      titles = [{ title_intro: doc.css('div.page-header h4').text,
+                  title_main: '', language: 'zh', script: 'Hans' }]
+      title_intro = doc.css('div.page-header h5').text
+      unless title_intro.empty?
+        titles << { title_intro: title_intro, title_main: '', language: 'en',
+                    script: 'Latn' }
+      end
+      titles
+    end
+    def get_type(_doc)
+      'standard'
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Hash]
+    #   * :status [String]
+    #   * :stage [String]
+    #   * :substage [String]
+    def get_status(doc, xpt = '.s-status.label:nth-child(3)')
+      status = case doc.at(xpt).text.gsub(/\s/, '')
+               when '即将实施' then 'published'
+               when '现行' then 'activated'
+               when '废止' then 'obsoleted'
+               end
+      { status: status, stage: '', substage: '' }
+    end
+    private
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Hash]
+    #   * :scope [String]
+    #   * :prefix [String]
+    #   * :mandate [String]
+    def get_gbtype(doc)
+      ref = get_ref(doc)
+      { scope: get_scope(doc), prefix: get_prefix(ref)['prefix'],
+        mandate: get_mandate(ref) }
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [String]
+    def get_ref(doc)
+      doc.xpath('//dt[text()="标准号"]/following-sibling::dd[1]').text
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Array<String>]
+    def get_ccs(doc)
+      [doc.xpath('//dt[text()="中国标准分类号"]/following-sibling::dd[1]').text]
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Array<Hash>]
+    #   * :field [String]
+    #   * :group [String]
+    #   * :subgroup [String]
+    def get_ics(doc)
+      ics = doc.xpath('//dt[(.="国际标准分类号")]/following-sibling::dd[1]/span')
+      field, group, subgroup = ics.text.split '.'
+      [{ field: field, group: group.ljust(3, '0'), subgroup: subgroup }]
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [String]
+    def get_scope(doc)
+      scope = doc.at('.s-status.label-info').text
+      if scope == '国家标准'
+        'national'
+      elsif scope.match?(/^行业标准/)
+        'sector'
+      end
+    end
+    # @param ref [String]
+    # @return [String]
+    def get_prefix(ref)
+      pref = ref.match(/^[^\s]+/).to_s.split('/').first
+      prefix pref
+    end
+    # @param pref [String]
+    # @return [Hash{String=>String}]
+    def prefix(pref)
+      file_path = File.join(__dir__, 'yaml/prefixes.yaml')
+      @prefixes ||= YAML.load_file(file_path)
+      @prefixes[pref]
+    end
+    # @param ref [String]
+    # @return [String]
+    def get_mandate(ref)
+      case ref.match(%r{(?<=\/)[^\s]+}).to_s
+      when 'T' then 'recommended'
+      when 'Z' then 'guidelines'
+      else 'mandatory'
+      end
+    end
+    # @param doc [Nokogiri::HTML::Document]
+    # @return [Array<Hash>]
+    #   * :type [String] type of date
+    #   * :on [String] date
+    def get_dates(doc)
+      date = doc.xpath('//dt[.="发布日期"]/following-sibling::dd[1]').text
+      [{ type: 'published', on: date }]
+    end
+  end
+end

data/lib/gbbib/sec_scrapper.rb ADDED Viewed

@@ -0,0 +1,51 @@
+# encoding: UTF-8
+# frozen_string_literal: true
+require 'net/http'
+require 'json'
+require 'nokogiri'
+require 'gbbib/scrapper'
+require 'gbbib/gb_bibliographic_item'
+require 'gbbib/hit_collection'
+require 'gbbib/hit'
+module Gbbib
+  # Sector standard scrapper
+  module SecScrapper
+    extend Scrapper
+    class << self
+      # @param text [String] code of standard for serarch
+      # @return [Gbbib::HitCollection]
+      def scrape_page(text)
+        uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{text}"
+        res = JSON.parse Net::HTTP.get(uri)
+        hits = res['rows'].map do |r|
+          Hit.new pid: r['id'], title: r['STD_CODE'], scrapper: self
+        end
+        HitCollection.new hits
+      end
+      # @param pid [String] standard's page id
+      # @return [Gbbib::GbBibliographicItem]
+      def scrape_doc(pid)
+        src = "http://www.std.gov.cn/hb/search/stdHBDetailed?id=#{pid}"
+        page_uri = URI src
+        doc = Nokogiri::HTML Net::HTTP.get(page_uri)
+        GbBibliographicItem.new scrapped_data(doc, src: src)
+      end
+      private
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Hash]
+      #   * :type [String]
+      #   * :name [String]
+      def get_committee(doc)
+        ref = get_ref(doc)
+        name = get_prefix(ref)['administration']
+        { type: 'technical', name: name }
+      end
+    end
+  end
+end

data/lib/gbbib/t_scrapper.rb ADDED Viewed

@@ -0,0 +1,116 @@
+# encoding: UTF-8
+# frozen_string_literal: true
+require 'cgi'
+require 'open-uri'
+require 'nokogiri'
+require 'gbbib/scrapper'
+require 'gbbib/gb_bibliographic_item'
+require 'gbbib/hit_collection'
+require 'gbbib/hit'
+module Gbbib
+  # Social standard scarpper.
+  module TScrapper
+    extend Scrapper
+    class << self
+      # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
+      # @param text [String]
+      # @return [Gbbib::HitCollection]
+      def scrape_page(text)
+        search_html = OpenURI.open_uri(
+          'http://www.ttbz.org.cn/Home/Standard?searchType=2&key=' +
+          CGI.escape(text.tr('-', [8212].pack('U')))
+        )
+        header = Nokogiri::HTML search_html
+        xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
+        t_xpath = '../preceding-sibling::td[3]'
+        hits = header.xpath(xpath).map do |h|
+          title = h.at(t_xpath).text.gsub(/â\u0080\u0094/, '-')
+          Hit.new pid: h[:href].sub(%r{\/$}, ''), title: title, scrapper: self
+        end
+        HitCollection.new hits
+      end
+      # rubocop:enable Metrics/MethodLength, Metrics/AbcSize
+      # @param pid [String] standard's page path
+      # @return [Gbbib::GbBibliographicItem]
+      def scrape_doc(pid)
+        src = "http://www.ttbz.org.cn#{pid}"
+        doc = Nokogiri::HTML OpenURI.open_uri(src), nil, Encoding::UTF_8.to_s
+        GbBibliographicItem.new scrapped_data(doc, src: src)
+      end
+      private
+      # rubocop:disable Metrics/MethodLength
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Hash]
+      def scrapped_data(doc, src:)
+        docid_xpt  = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
+        status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
+        {
+          committee: get_committee(doc),
+          docid:     get_docid(doc, docid_xpt),
+          titles:    get_titles(doc),
+          type:      'standard',
+          docstatus: get_status(doc, status_xpt),
+          gbtype:    gbtype,
+          ccs:       get_ccs(doc),
+          ics:       get_ics(doc),
+          source:    [{ type: 'src', content: src }],
+          dates:     get_dates(doc),
+          language:  ['zh'],
+          script:    ['Hans']
+        }
+      end
+      # rubocop:enable Metrics/MethodLength
+      def get_committee(doc)
+        {
+          name: doc.xpath('//td[.="团体名称"]/following-sibling::td[1]').text,
+          type: 'technical'
+        }
+      end
+      def get_titles(doc)
+        xpath  = '//td[contains(.,"中文标题")]/following-sibling::td[1]'
+        titles = [{ title_intro: doc.xpath(xpath).text,
+                    title_main: '', language: 'zh', script: 'Hans' }]
+        xpath = '//td[contains(.,"英文标题")]/following-sibling::td[1]'
+        title_intro = doc.xpath(xpath).text
+        unless title_intro.empty?
+          titles << { title_intro: title_intro, title_main: '', language: 'en',
+                      script: 'Latn' }
+        end
+        titles
+      end
+      def gbtype
+        { scope: 'social-group', prefix: 'T', mandate: 'mandatory' }
+      end
+      # def get_group_code(ref)
+      #   ref.match(%r{(?<=\/)[^\s]})
+      # end
+      def get_ccs(doc)
+        [doc.xpath('//td[contains(.,"中国标准分类号")]/following-sibling::td[1]')
+            .text.gsub(/[\r\n]/, '').strip.match(/^[^\s]+/).to_s]
+      end
+      def get_ics(doc)
+        xpath = '//td[contains(.,"国际标准分类号")]/following-sibling::td[1]/span'
+        ics = doc.xpath(xpath).text.match(/^[^\s]+/).to_s
+        field, group, subgroup = ics.split '.'
+        [{ field: field, group: group.ljust(3, '0'), subgroup: subgroup }]
+      end
+      def get_dates(doc)
+        d = doc.xpath('//td[contains(.,"发布日期")]/following-sibling::td[1]/span')
+               .text.match(/(?<y>\d{4})[^\d]+(?<m>\d{2})[^\d]+(?<d>\d{2})/)
+        [{ type: 'published', on: "#{d[:y]}-#{d[:m]}-#{d[:d]}" }]
+      end
+    end
+  end
+end

data/lib/gbbib/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+# Namespace of the gbbib gem.
+module Gbbib
+  # Gem version; gbbib.gemspec reads it as Gbbib::VERSION.
+  VERSION = '0.1.0'
+end

data/lib/gbbib/yaml/prefixes.yaml ADDED Viewed

@@ -0,0 +1,197 @@
+# GbStandardNationalPrefix
+GB:
+  prefix: GB_national
+GBn:
+  prefix: GBn_confidential
+GJB:
+  prefix: GJB_military
+GSB:
+  prefix: GSB_physical
+# GbStandardSectorPrefix
+NY:
+  prefix: NY_agriculture
+  administration: 农业部
+SC:
+  prefix: SC_aquatic
+  administration: 农业部
+SL:
+  prefix: SL_water_resources
+  administration: 水利部
+QB:
+  prefix: QB_light_industry
+  administration: 国家发改委
+FZ:
+  prefix: FZ_textile
+  administration: 国家发改委
+YY:
+  prefix: YY_medicine
+  administration: 国家食品药品监督管理局
+MZ:
+  prefix: MZ_civil_affairs
+  administration: 民政部
+JY:
+  prefix: JY_education
+  administration: 教育部
+YC:
+  prefix: YC_tobacco
+  administration: 国家烟草专卖局
+YB:
+  prefix: YB_ferrous_metallurgy
+  administration: 国家发改委
+YS:
+  prefix: YS_nonferrous_metallurgy
+  administration: 国家发改委
+SY:
+  prefix: SY_natural_gas
+  administration: 国家发改委
+HG:
+  prefix: HG_chemical_industry
+  administration: 国家发改委
+SH:
+  prefix: SH_petrochemical
+  administration: 国家发改委
+JC:
+  prefix: JC_building_materials
+  administration: 国家发改委
+DZ:
+  prefix: DZ_geological
+  administration: 国土资源部
+TD:
+  prefix: TD_land_management
+  administration: 国土资源部
+CH:
+  prefix: CH_surveying
+  administration: 国家测绘局
+JB:
+  prefix: JB_mechanical
+  administration: 国家发改委
+QC:
+  prefix: QC_automotile
+  administration: 国家发改委
+MH:
+  prefix: MH_civil_aviation
+  administration: 中国民航管理总局
+WJ:
+  prefix: WJ_ordnance
+  administration: 国防科学工业委员会
+CB:
+  prefix: CB_ships
+  administration: 国防科学工业委员会
+HB:
+  prefix: HB_aviation
+  administration: 国防科学工业委员会
+QJ:
+  prefix: QJ_aerospace
+  administration: 国防科学工业委员会
+EJ:
+  prefix: EJ_nuclear_industry
+  administration: 国防科学工业委员会
+TB:
+  prefix: TB_rail_transport
+  administration: 铁道部
+JT:
+  prefix: JT_traffic
+  administration: 交通部
+LD:
+  prefix: LD_work_safety
+  administration: 劳动和社会保障部
+SJ:
+  prefix: SJ_electronics
+  administration: 信息产业部
+YD:
+  prefix: YD_communication
+  administration: 信息产业部
+GY:
+  prefix: GY_media
+  administration: 国家广播电影电视总局
+DL:
+  prefix: DL_electricity
+  administration: 国家发改委
+JR:
+  prefix: JR_financial
+  administration: 中国人民银行
+HY:
+  prefix: HY_oceanic
+  administration: 国家海洋局
+DA:
+  prefix: DA_archiving
+  administration: 国家档案局
+SN:
+  prefix: SN_inspection
+  administration: 国家质量监督检验检疫总局
+WH:
+  prefix: WH_culture
+  administration: 文化部
+TY:
+  prefix: TY_sports
+  administration: 国家体育总局
+SB:
+  prefix: SB_business
+  administration: 商务部
+WB:
+  prefix: WB_materials_management
+  administration: 国家发改委
+HJ:
+  prefix: HJ_environment_protection
+  administration: 国家环境保护总局
+XB:
+  prefix: XB_rare_earth
+  administration: 国家发改委稀土办公室
+CJ:
+  prefix: CJ_town_construction
+  administration: 建设部
+JG:
+  prefix: JG_construction_industry
+  administration: 建设部
+CY:
+  prefix: CY_news_publishing
+  administration: 国家新闻出版总署
+MT:
+  prefix: MT_coal
+  administration: 国家发改委
+WS:
+  prefix: WS_health
+  administration: 卫生部
+GA:
+  prefix: GA_public_safety
+  administration: 公安部
+BB:
+  prefix: BB_packaging
+  administration: 国家发改委
+DB:
+  prefix: DB_seismology
+  administration: 中国地震局
+LB:
+  prefix: LB_tourism
+  administration: 国家旅游局
+QX:
+  prefix: QX_meteorological
+  administration: 中国气象局
+WM:
+  prefix: WM_foreign_trade
+  administration: 外经贸部科技司
+HS:
+  prefix: HS_customs
+  administration: 海关总署
+YZ:
+  prefix: YZ_postal
+  administration: 国家邮政局
+GM:
+  prefix: GM_cryptography
+  administration: 国家密码管理局
+AQ:
+  prefix: AQ_production_safety
+  administration: 国家安全生产管理局
+GH:
+  prefix: GH_supply_marketing
+  administration: 中华全国供销合作总社
+LS:
+  prefix: LS_food
+  administration: 国家粮食局
+TJ:
+  prefix: TJ_railway_traffic
+  administration: 铁道部标准所
+ZY:
+  prefix: ZY_chinese_medicine
+  administration: 国家中医药管理局

data/lib/gbbib.rb ADDED Viewed

@@ -0,0 +1,2 @@
+require 'gbbib/version'
+require 'gbbib/gb_bibliography'

metadata ADDED Viewed

@@ -0,0 +1,169 @@
+--- !ruby/object:Gem::Specification
+name: gbbib
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Ribose Inc.
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2018-06-05 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.16'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.16'
+- !ruby/object:Gem::Dependency
+  name: pry-byebug
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+- !ruby/object:Gem::Dependency
+  name: simplecov
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: cnccs
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: iso-bib-item
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: 'GdBib: retrieve Chinese GB Standards for bibliographic use using the
+  BibliographicItem model.'
+email:
+- pen.source@ribose.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".rspec"
+- ".rubocop.yml"
+- ".travis.yml"
+- Gemfile
+- Gemfile.lock
+- LICENSE.txt
+- README.adoc
+- Rakefile
+- bin/console
+- bin/setup
+- gbbib.gemspec
+- lib/gbbib.rb
+- lib/gbbib/gb_bibliographic_item.rb
+- lib/gbbib/gb_bibliography.rb
+- lib/gbbib/gb_scrapper.rb
+- lib/gbbib/gb_standard_type.rb
+- lib/gbbib/gb_technical_committee.rb
+- lib/gbbib/hit.rb
+- lib/gbbib/hit_collection.rb
+- lib/gbbib/scrapper.rb
+- lib/gbbib/sec_scrapper.rb
+- lib/gbbib/t_scrapper.rb
+- lib/gbbib/version.rb
+- lib/gbbib/yaml/prefixes.yaml
+homepage: https://github.com/riboseinc/gdbib
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.6.12
+signing_key:
+specification_version: 4
+summary: 'GdBib: retrieve Chinese GB Standards for bibliographic use using the BibliographicItem
+  model.'
+test_files: []