gbbib 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6e45fa975e5e4cc8f444bd95318f654d51e4cf49
4
+ data.tar.gz: 0a15ef16df85fb6881e255bfeb720994a3ebe8bf
5
+ SHA512:
6
+ metadata.gz: 63a535f450bad7e4160441d0ef5c9bb27177d28fbee9381e9b62a69a1afe15d76adaca3349702a0056312fe40d1ae0e97401094ce11d440b599d0f7eedc94a2f
7
+ data.tar.gz: ca2ddb4662296000b02b20709edc4727923a453cff78b84da4e3612a67dd72cf701bde041107905203d17b2068f0447a43bc9e00791f5112211a5598c8c785f5
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,5 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.4.0
3
+
4
+ Style/Encoding:
5
+ Enabled: false
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.4.0
5
+ before_install: gem install bundler -v 1.16.1
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in gbbib.gemspec
6
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,63 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ gbbib (0.1.0)
5
+ cnccs
6
+ iso-bib-item
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ byebug (10.0.0)
12
+ cnccs (0.1.1)
13
+ coderay (1.1.2)
14
+ diff-lcs (1.3)
15
+ docile (1.1.5)
16
+ iso-bib-item (0.1.4)
17
+ isoics (~> 0.1.6)
18
+ nokogiri
19
+ isoics (0.1.6)
20
+ json (2.1.0)
21
+ method_source (0.9.0)
22
+ mini_portile2 (2.3.0)
23
+ nokogiri (1.8.2)
24
+ mini_portile2 (~> 2.3.0)
25
+ pry (0.11.3)
26
+ coderay (~> 1.1.0)
27
+ method_source (~> 0.9.0)
28
+ pry-byebug (3.6.0)
29
+ byebug (~> 10.0)
30
+ pry (~> 0.10)
31
+ rake (10.5.0)
32
+ rspec (3.7.0)
33
+ rspec-core (~> 3.7.0)
34
+ rspec-expectations (~> 3.7.0)
35
+ rspec-mocks (~> 3.7.0)
36
+ rspec-core (3.7.1)
37
+ rspec-support (~> 3.7.0)
38
+ rspec-expectations (3.7.0)
39
+ diff-lcs (>= 1.2.0, < 2.0)
40
+ rspec-support (~> 3.7.0)
41
+ rspec-mocks (3.7.0)
42
+ diff-lcs (>= 1.2.0, < 2.0)
43
+ rspec-support (~> 3.7.0)
44
+ rspec-support (3.7.1)
45
+ simplecov (0.15.1)
46
+ docile (~> 1.1.0)
47
+ json (>= 1.8, < 3)
48
+ simplecov-html (~> 0.10.0)
49
+ simplecov-html (0.10.2)
50
+
51
+ PLATFORMS
52
+ ruby
53
+
54
+ DEPENDENCIES
55
+ bundler (~> 1.16)
56
+ gbbib!
57
+ pry-byebug
58
+ rake (~> 10.0)
59
+ rspec (~> 3.0)
60
+ simplecov
61
+
62
+ BUNDLED WITH
63
+ 1.16.1
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2018 Andrei Kislichenko
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.adoc ADDED
@@ -0,0 +1,39 @@
1
+ # Gbbib
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/gbbib`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'gbbib'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install gbbib
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/gdbib.
36
+
37
+ ## License
38
+
39
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "gdbib"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/gbbib.gemspec ADDED
@@ -0,0 +1,35 @@
1
+
2
+ lib = File.expand_path('lib', __dir__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'gbbib/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'gbbib'
8
+ spec.version = Gbbib::VERSION
9
+ spec.authors = ['Ribose Inc.']
10
+ spec.email = ['pen.source@ribose.com']
11
+
12
+ spec.summary = 'GdBib: retrieve Chinese GB Standards for bibliographic'\
13
+ ' use using the BibliographicItem model.'
14
+ spec.description = 'GdBib: retrieve Chinese GB Standards for bibliographic'\
15
+ ' use using the BibliographicItem model.'
16
+ spec.homepage = 'https://github.com/riboseinc/gdbib'
17
+ spec.license = 'MIT'
18
+
19
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
20
+ f.match(%r{^(test|spec|features)/})
21
+ end
22
+ spec.bindir = 'exe'
23
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
24
+ spec.require_paths = ['lib']
25
+
26
+ spec.add_development_dependency 'bundler', '~> 1.16'
27
+ spec.add_development_dependency 'pry-byebug'
28
+ spec.add_development_dependency 'rake', '~> 10.0'
29
+ spec.add_development_dependency 'rspec', '~> 3.0'
30
+ spec.add_development_dependency 'simplecov'
31
+
32
+ spec.add_dependency 'cnccs'
33
+ spec.add_dependency 'iso-bib-item'
34
+ # spec.add_dependency 'nokogiri'
35
+ end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'iso_bib_item'
4
+ require 'cnccs'
5
+ require 'gbbib/gb_technical_committee'
6
+ require 'gbbib/gb_standard_type'
7
+
8
+ module Gbbib
9
+ # GB bibliographic item class.
10
+ class GbBibliographicItem < IsoBibItem::IsoBibliographicItem
11
+ # @return [Gbbib::GbTechnicalCommittee]
12
+ attr_reader :committee
13
+
14
+ # @return [Gbbib::GbStandardType]
15
+ attr_reader :gbtype
16
+
17
+ # @return [String]
18
+ attr_reader :topic
19
+
20
+ # @return [Array<Cnccs::Ccs>]
21
+ attr_reader :ccs
22
+
23
+ # @return [String]
24
+ attr_reader :plan_number
25
+
26
+ # @return [String]
27
+ attr_reader :type
28
+
29
+ def initialize(**args)
30
+ super
31
+ @committee = GbTechnicalCommittee.new args[:committee]
32
+ @ccs = args[:ccs].map { |c| Cnccs.fetch c }
33
+ @gbtype = GbStandardType.new args[:gbtype]
34
+ @type = args[:type]
35
+ end
36
+
37
+ # @param builder [Nokogiri::XML::Builder]
38
+ # @return [String]
39
+ def to_xml(builder = nil, **opts)
40
+ if builder
41
+ super(builder, opts) { |xml| render_gbxml(xml) }
42
+ else
43
+ Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |bldr|
44
+ super(bldr, opts) { |xml| render_gbxml(xml) }
45
+ end.doc.root.to_xml
46
+ end
47
+ end
48
+
49
+ private
50
+
51
+ # @param builder [Nokogiri::XML::Builder]
52
+ def render_gbxml(builder)
53
+ committee.to_xml builder
54
+ gbtype.to_xml builder
55
+ ccs.each { |c| builder.ccs c.description } if ccs.any?
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ # GB bib module.
4
+ module Gbbib
5
+ # GB entry point class.
6
+ class GbBibliography
7
+ class << self
8
+ # rubocop:disable Metrics/MethodLength
9
+ # @param text [String] code of standard for search
10
+ # @return [Gbbib::HitCollection, nil] nil for the branches without a scrapper
11
+ def search(text)
12
+ if text.match?(/^(GB|GJ|GS)/)
13
+ # Scrape national standards.
14
+ require 'gbbib/gb_scrapper'
15
+ GbScrapper.scrape_page text
16
+ elsif text.match?(/^ZB/)
17
+ # Professional standard: no scrapper yet, this branch yields nil.
18
+ elsif text.match?(/^DB/)
19
+ # Local standard: no scrapper yet, this branch yields nil.
20
+ elsif text.match? %r{^Q\/}
21
+ # Enterprise standard: no scrapper yet, this branch yields nil.
22
+ elsif text.match? %r{^T\/[^\s]{3,6}\s}
23
+ # Scrape social standard.
24
+ require 'gbbib/t_scrapper'
25
+ TScrapper.scrape_page text
26
+ else
27
+ # Scrape sector standard.
28
+ require 'gbbib/sec_scrapper'
29
+ SecScrapper.scrape_page text
30
+ end
31
+ end
32
+ # rubocop:enable Metrics/MethodLength
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,50 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
+ require 'open-uri'
5
+ require 'nokogiri'
6
+ require 'gbbib/scrapper'
7
+ require 'gbbib/gb_bibliographic_item'
8
+ require 'gbbib/gb_standard_type'
9
+ require 'gbbib/hit_collection'
10
+ require 'gbbib/hit'
11
+
12
+ module Gbbib
13
+ # National standard scrapper.
14
+ module GbScrapper
15
+ extend Scrapper
16
+
17
+ class << self
18
+ # @param text [Strin] code of standard for serarch
19
+ # @return [Gbbib::HitCollection]
20
+ def scrape_page(text)
21
+ search_html = OpenURI.open_uri(
22
+ 'http://www.std.gov.cn/search/stdPage?q=' + text
23
+ )
24
+ result = Nokogiri::HTML search_html
25
+ hits = result.css('.s-title a').map do |h|
26
+ Hit.new pid: h[:pid], title: h.text, scrapper: self
27
+ end
28
+ HitCollection.new hits
29
+ end
30
+
31
+ # @param pid [Strin] standard's page id
32
+ # @return [Gbbib::GbBibliographicItem]
33
+ def scrape_doc(pid)
34
+ src = 'http://www.std.gov.cn/gb/search/gbDetailed?id=' + pid
35
+ doc = Nokogiri::HTML OpenURI.open_uri(src)
36
+ GbBibliographicItem.new scrapped_data(doc, src: src)
37
+ end
38
+
39
+ # @param doc [Nokogiri::HTML]
40
+ # @return [Hash]
41
+ # * :type [String]
42
+ # * :name [String]
43
+ def get_committee(doc)
44
+ name = doc.xpath('//p/a[1]/following-sibling::text()').text
45
+ .match(/(?<=()[^)]+/).to_s
46
+ { type: 'technical', name: name }
47
+ end
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,30 @@
1
+ module Gbbib
2
+ # GB standard type.
3
+ class GbStandardType
4
+ # @return [String]
5
+ attr_reader :scope
6
+
7
+ # @return [String]
8
+ attr_reader :prefix
9
+
10
+ # @return [String]
11
+ attr_reader :mandate
12
+
13
+ # @param scope [String]
14
+ # @param prefix [String]
15
+ # @param mandate [String]
16
+ def initialize(scope:, prefix:, mandate:)
17
+ @scope = scope
18
+ @prefix = prefix
19
+ @mandate = mandate
20
+ end
21
+
22
+ def to_xml(builder)
23
+ builder.gbtype do
24
+ builder.gbscope @scope
25
+ builder.gbprefix @prefix
26
+ builder.gbmandate @mandate
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,23 @@
1
+ module Gbbib
2
+ # GB technical committee.
3
+ class GbTechnicalCommittee
4
+ # @return [String]
5
+ attr_reader :type
6
+
7
+ # @return [String]
8
+ attr_reader :name
9
+
10
+ # @param type [String]
11
+ # @param name [String]
12
+ def initialize(type:, name:)
13
+ @type = type
14
+ @name = name
15
+ end
16
+
17
+ def to_xml(builder)
18
+ builder.gbcommittee(type: @type) do
19
+ builder.text @name
20
+ end
21
+ end
22
+ end
23
+ end
data/lib/gbbib/hit.rb ADDED
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Gbbib
4
+ # A single search result; the full record is loaded on demand by #fetch.
5
+ class Hit
6
+ # @return [Gbbib::HitCollection, nil] nil when no collection was given
7
+ attr_reader :hit_collection
8
+
9
+ # @return [String]
10
+ attr_reader :pid
11
+
12
+ # @return [String]
13
+ attr_reader :title
14
+
15
+ # @return [Gbbib::GbScrapper, Gbbib::SecScrapper, Gbbib::TScrapper]
16
+ attr_reader :scrapper
17
+
18
+ # @param pid [String] identifier handed to scrapper.scrape_doc by #fetch
19
+ # @param hit_collection [Array, nil] when given, this hit appends itself to it
20
+ def initialize(pid:, title:, hit_collection: nil, scrapper:)
21
+ @pid = pid
22
+ @title = title
23
+ @hit_collection = hit_collection
24
+ @scrapper = scrapper
25
+ self.hit_collection << self if hit_collection
26
+ end
27
+
28
+ # Scrapes the document via the scrapper once and memoizes the result.
29
+ # @return [Gbbib::GbBibliographicItem]
30
+ def fetch
31
+ @fetch ||= scrapper.scrape_doc pid
32
+ end
33
+
34
+ # @return [String] same text as #inspect
35
+ def to_s
36
+ inspect
37
+ end
38
+
39
+ # @return [String] @fullIdentifier stays empty until #fetch has been called
40
+ def inspect
41
+ "<#{self.class}:#{format('%#.14x', object_id << 1)} "\
42
+ "@fullIdentifier=\"#{@fetch&.shortref}\" "\
43
+ "@title=\"#{title}\">"
44
+ end
45
+
46
+ # @return [String] XML of the root element when no builder is passed
47
+ def to_xml(builder = nil, opts = {})
48
+ if builder
49
+ fetch.to_xml builder, opts
50
+ else
51
+ builder = Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
52
+ fetch.to_xml xml, opts
53
+ end
54
+ builder.doc.root.to_xml
55
+ end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Gbbib
4
+ # Page of hit collection
5
+ class HitCollection < Array
6
+ # @return [TrueClass, FalseClass]
7
+ attr_reader :fetched
8
+
9
+ # @return [Isobib::HitPages]
10
+ attr_reader :hit_pages
11
+
12
+ # @return [Gbbib::GbScrapper, Gbbib::SecScrapper, Gbbib::TScrapper]
13
+ attr_reader :scrapper
14
+
15
+ # @param hits [Array<Hash>]
16
+ # @param hit_pages [Integer]
17
+ # @param scrapper [Gbbib::GbScrapper, Gbbib::SecScrapper, Gbbib::TScrapper]
18
+ def initialize(hits = nil, hit_pages = nil)
19
+ concat hits
20
+ @fetched = false
21
+ @hit_pages = hit_pages
22
+ end
23
+
24
+ # @return [Isobib::HitCollection]
25
+ # def fetch
26
+ # workers = WorkersPool.new 4
27
+ # workers.worker(&:fetch)
28
+ # each do |hit|
29
+ # workers << hit
30
+ # end
31
+ # workers.end
32
+ # workers.result
33
+ # @fetched = true
34
+ # self
35
+ # end
36
+
37
+ # def to_s
38
+ # inspect
39
+ # end
40
+ #
41
+ # def inspect
42
+ # "<#{self.class}:#{format('%#.14x', object_id << 1)} @fetched=#{@fetched}>"
43
+ # end
44
+ end
45
+ end
@@ -0,0 +1,159 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
+ require 'yaml'
5
+
6
+ module Gbbib
7
+ # Common scrapping methods.
8
+ module Scrapper
9
+ @prefixes = nil
10
+
11
+ # rubocop:disable Metrics/MethodLength
12
+ # @param doc [Nokogiri::HTML::Document]
13
+ # @param src [String] url of scrapped page
14
+ # @return [Hash]
15
+ def scrapped_data(doc, src:)
16
+ {
17
+ committee: get_committee(doc),
18
+ docid: get_docid(doc),
19
+ titles: get_titles(doc),
20
+ type: get_type(doc),
21
+ docstatus: get_status(doc),
22
+ gbtype: get_gbtype(doc),
23
+ ccs: get_ccs(doc),
24
+ ics: get_ics(doc),
25
+ source: [{ type: 'src', content: src }],
26
+ dates: get_dates(doc),
27
+ language: ['zh'],
28
+ script: ['Hans']
29
+ }
30
+ end
31
+ # rubocop:enable Metrics/MethodLength
32
+
33
+ # @param doc [Nokogiri::HTML::Document]
34
+ # @return [Hash]
35
+ # * :project_number [String]
36
+ # * :part_number [String]
37
+ def get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
38
+ item_ref = doc.xpath(xpt)
39
+ .text.match(/(?<=\s)(\d+)-?((?<=-)\d+|)/)
40
+ { project_number: item_ref[1], part_number: item_ref[2] }
41
+ end
42
+
43
+ # @param doc [Nokogiri::HTML::Document]
44
+ # @return [Array<Hash>]
45
+ # * :title_intro [String]
46
+ # * :title_main [String]
47
+ # * :language [String]
48
+ # * :script [String]
49
+ def get_titles(doc)
50
+ titles = [{ title_intro: doc.css('div.page-header h4').text,
51
+ title_main: '', language: 'zh', script: 'Hans' }]
52
+ title_intro = doc.css('div.page-header h5').text
53
+ unless title_intro.empty?
54
+ titles << { title_intro: title_intro, title_main: '', language: 'en',
55
+ script: 'Latn' }
56
+ end
57
+ titles
58
+ end
59
+
60
+ def get_type(_doc)
61
+ 'standard'
62
+ end
63
+
64
+ # @param doc [Nokogiri::HTML::Document]
65
+ # @return [Hash]
66
+ # * :status [String]
67
+ # * :stage [String]
68
+ # * :substage [String]
69
+ def get_status(doc, xpt = '.s-status.label:nth-child(3)')
70
+ status = case doc.at(xpt).text.gsub(/\s/, '')
71
+ when '即将实施' then 'published'
72
+ when '现行' then 'activated'
73
+ when '废止' then 'obsoleted'
74
+ end
75
+ { status: status, stage: '', substage: '' }
76
+ end
77
+
78
+ private
79
+
80
+ # @param doc [Nokogiri::HTML::Document]
81
+ # @return [Hash]
82
+ # * :scope [String]
83
+ # * :prefix [String]
84
+ # * :mandate [String]
85
+ def get_gbtype(doc)
86
+ ref = get_ref(doc)
87
+ { scope: get_scope(doc), prefix: get_prefix(ref)['prefix'],
88
+ mandate: get_mandate(ref) }
89
+ end
90
+
91
+ # @param doc [Nokogiri::HTML::Document]
92
+ # @return [String]
93
+ def get_ref(doc)
94
+ doc.xpath('//dt[text()="标准号"]/following-sibling::dd[1]').text
95
+ end
96
+
97
+ # @param doc [Nokogiri::HTML::Document]
98
+ # @return [Array<String>]
99
+ def get_ccs(doc)
100
+ [doc.xpath('//dt[text()="中国标准分类号"]/following-sibling::dd[1]').text]
101
+ end
102
+
103
+ # @param doc [Nokogiri::HTML::Document]
104
+ # @return [Array<Hash>]
105
+ # * :field [String]
106
+ # * :group [String]
107
+ # * :subgroup [String]
108
+ def get_ics(doc)
109
+ ics = doc.xpath('//dt[(.="国际标准分类号")]/following-sibling::dd[1]/span')
110
+ field, group, subgroup = ics.text.split '.'
111
+ [{ field: field, group: group.ljust(3, '0'), subgroup: subgroup }]
112
+ end
113
+
114
+ # @param doc [Nokogiri::HTML::Document]
115
+ # @return [String]
116
+ def get_scope(doc)
117
+ scope = doc.at('.s-status.label-info').text
118
+ if scope == '国家标准'
119
+ 'national'
120
+ elsif scope.match?(/^行业标准/)
121
+ 'sector'
122
+ end
123
+ end
124
+
125
+ # @param ref [String]
126
+ # @return [String]
127
+ def get_prefix(ref)
128
+ pref = ref.match(/^[^\s]+/).to_s.split('/').first
129
+ prefix pref
130
+ end
131
+
132
+ # @param pref [String]
133
+ # @return [Hash{String=>String}]
134
+ def prefix(pref)
135
+ file_path = File.join(__dir__, 'yaml/prefixes.yaml')
136
+ @prefixes ||= YAML.load_file(file_path)
137
+ @prefixes[pref]
138
+ end
139
+
140
+ # @param ref [String]
141
+ # @return [String]
142
+ def get_mandate(ref)
143
+ case ref.match(%r{(?<=\/)[^\s]+}).to_s
144
+ when 'T' then 'recommended'
145
+ when 'Z' then 'guidelines'
146
+ else 'mandatory'
147
+ end
148
+ end
149
+
150
+ # @param doc [Nokogiri::HTML::Document]
151
+ # @return [Array<Hash>]
152
+ # * :type [String] type of date
153
+ # * :on [String] date
154
+ def get_dates(doc)
155
+ date = doc.xpath('//dt[.="发布日期"]/following-sibling::dd[1]').text
156
+ [{ type: 'published', on: date }]
157
+ end
158
+ end
159
+ end
@@ -0,0 +1,51 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
+ require 'net/http'
5
+ require 'json'
6
+ require 'nokogiri'
7
+ require 'gbbib/scrapper'
8
+ require 'gbbib/gb_bibliographic_item'
9
+ require 'gbbib/hit_collection'
10
+ require 'gbbib/hit'
11
+
12
+ module Gbbib
13
+ # Sector standard scrapper
14
+ module SecScrapper
15
+ extend Scrapper
16
+
17
+ class << self
18
+ # @param text [String] code of standard for serarch
19
+ # @return [Gbbib::HitCollection]
20
+ def scrape_page(text)
21
+ uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{text}"
22
+ res = JSON.parse Net::HTTP.get(uri)
23
+ hits = res['rows'].map do |r|
24
+ Hit.new pid: r['id'], title: r['STD_CODE'], scrapper: self
25
+ end
26
+ HitCollection.new hits
27
+ end
28
+
29
+ # @param pid [String] standard's page id
30
+ # @return [Gbbib::GbBibliographicItem]
31
+ def scrape_doc(pid)
32
+ src = "http://www.std.gov.cn/hb/search/stdHBDetailed?id=#{pid}"
33
+ page_uri = URI src
34
+ doc = Nokogiri::HTML Net::HTTP.get(page_uri)
35
+ GbBibliographicItem.new scrapped_data(doc, src: src)
36
+ end
37
+
38
+ private
39
+
40
+ # @param doc [Nokogiri::HTML::Document]
41
+ # @return [Hash]
42
+ # * :type [String]
43
+ # * :name [String]
44
+ def get_committee(doc)
45
+ ref = get_ref(doc)
46
+ name = get_prefix(ref)['administration']
47
+ { type: 'technical', name: name }
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,116 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
+ require 'open-uri'
5
+ require 'nokogiri'
6
+ require 'gbbib/scrapper'
7
+ require 'gbbib/gb_bibliographic_item'
8
+ require 'gbbib/hit_collection'
9
+ require 'gbbib/hit'
10
+
11
+ module Gbbib
12
+ # Social standard scrapper.
13
+ module TScrapper
14
+ extend Scrapper
15
+
16
+ class << self
17
+ # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
18
+ # @param text [String]
19
+ # @return [Gbbib::HitCollection]
20
+ def scrape_page(text)
21
+ search_html = OpenURI.open_uri(
22
+ 'http://www.ttbz.org.cn/Home/Standard?searchType=2&key=' +
23
+ CGI.escape(text.tr('-', [8212].pack('U')))
24
+ )
25
+ header = Nokogiri::HTML search_html
26
+ xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
27
+ t_xpath = '../preceding-sibling::td[3]'
28
+ hits = header.xpath(xpath).map do |h|
29
+ title = h.at(t_xpath).text.gsub(/â\u0080\u0094/, '-')
30
+ Hit.new pid: h[:href].sub(%r{\/$}, ''), title: title, scrapper: self
31
+ end
32
+ HitCollection.new hits
33
+ end
34
+ # rubocop:enable Metrics/MethodLength, Metrics/AbcSize
35
+
36
+ # @param pid [String] standard's page path
37
+ # @return [Gbbib::GbBibliographicItem]
38
+ def scrape_doc(pid)
39
+ src = "http://www.ttbz.org.cn#{pid}"
40
+ doc = Nokogiri::HTML OpenURI.open_uri(src), nil, Encoding::UTF_8.to_s
41
+ GbBibliographicItem.new scrapped_data(doc, src: src)
42
+ end
43
+
44
+ private
45
+
46
+ # rubocop:disable Metrics/MethodLength
47
+ # @param doc [Nokogiri::HTML::Document]
48
+ # @return [Hash]
49
+ def scrapped_data(doc, src:)
50
+ docid_xpt = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
51
+ status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
52
+ {
53
+ committee: get_committee(doc),
54
+ docid: get_docid(doc, docid_xpt),
55
+ titles: get_titles(doc),
56
+ type: 'standard',
57
+ docstatus: get_status(doc, status_xpt),
58
+ gbtype: gbtype,
59
+ ccs: get_ccs(doc),
60
+ ics: get_ics(doc),
61
+ source: [{ type: 'src', content: src }],
62
+ dates: get_dates(doc),
63
+ language: ['zh'],
64
+ script: ['Hans']
65
+ }
66
+ end
67
+ # rubocop:enable Metrics/MethodLength
68
+
69
+ def get_committee(doc)
70
+ {
71
+ name: doc.xpath('//td[.="团体名称"]/following-sibling::td[1]').text,
72
+ type: 'technical'
73
+ }
74
+ end
75
+
76
+ def get_titles(doc)
77
+ xpath = '//td[contains(.,"中文标题")]/following-sibling::td[1]'
78
+ titles = [{ title_intro: doc.xpath(xpath).text,
79
+ title_main: '', language: 'zh', script: 'Hans' }]
80
+ xpath = '//td[contains(.,"英文标题")]/following-sibling::td[1]'
81
+ title_intro = doc.xpath(xpath).text
82
+ unless title_intro.empty?
83
+ titles << { title_intro: title_intro, title_main: '', language: 'en',
84
+ script: 'Latn' }
85
+ end
86
+ titles
87
+ end
88
+
89
+ def gbtype
90
+ { scope: 'social-group', prefix: 'T', mandate: 'mandatory' }
91
+ end
92
+
93
+ # def get_group_code(ref)
94
+ # ref.match(%r{(?<=\/)[^\s]})
95
+ # end
96
+
97
+ def get_ccs(doc)
98
+ [doc.xpath('//td[contains(.,"中国标准分类号")]/following-sibling::td[1]')
99
+ .text.gsub(/[\r\n]/, '').strip.match(/^[^\s]+/).to_s]
100
+ end
101
+
102
+ def get_ics(doc)
103
+ xpath = '//td[contains(.,"国际标准分类号")]/following-sibling::td[1]/span'
104
+ ics = doc.xpath(xpath).text.match(/^[^\s]+/).to_s
105
+ field, group, subgroup = ics.split '.'
106
+ [{ field: field, group: group.ljust(3, '0'), subgroup: subgroup }]
107
+ end
108
+
109
+ def get_dates(doc)
110
+ d = doc.xpath('//td[contains(.,"发布日期")]/following-sibling::td[1]/span')
111
+ .text.match(/(?<y>\d{4})[^\d]+(?<m>\d{2})[^\d]+(?<d>\d{2})/)
112
+ [{ type: 'published', on: "#{d[:y]}-#{d[:m]}-#{d[:d]}" }]
113
+ end
114
+ end
115
+ end
116
+ end
@@ -0,0 +1,3 @@
1
+ module Gbbib
2
+ VERSION = '0.1.0'
3
+ end
@@ -0,0 +1,197 @@
1
+ # GbStandardNationalPrefix
2
+ GB:
3
+ prefix: GB_national
4
+ GBn:
5
+ prefix: GBn_confidential
6
+ GJB:
7
+ prefix: GJB_military
8
+ GSB:
9
+ prefix: GSB_physical
10
+
11
+ # GbStandardSectorPrefix
12
+ NY:
13
+ prefix: NY_agriculture
14
+ administration: 农业部
15
+ SC:
16
+ prefix: SC_aquatic
17
+ administration: 农业部
18
+ SL:
19
+ prefix: SL_water_resources
20
+ administration: 水利部
21
+ QB:
22
+ prefix: QB_light_industry
23
+ administration: 国家发改委
24
+ FZ:
25
+ prefix: FZ_textile
26
+ administration: 国家发改委
27
+ YY:
28
+ prefix: YY_medicine
29
+ administration: 国家食品药品监督管理局
30
+ MZ:
31
+ prefix: MZ_civil_affairs
32
+ administration: 民政部
33
+ JY:
34
+ prefix: JY_education
35
+ administration: 教育部
36
+ YC:
37
+ prefix: YC_tobacco
38
+ administration: 国家烟草专卖局
39
+ YB:
40
+ prefix: YB_ferrous_metallurgy
41
+ administration: 国家发改委
42
+ YS:
43
+ prefix: YS_nonferrous_metallurgy
44
+ administration: 国家发改委
45
+ SY:
46
+ prefix: SY_natural_gas
47
+ administration: 国家发改委
48
+ HG:
49
+ prefix: HG_chemical_industry
50
+ administration: 国家发改委
51
+ SH:
52
+ prefix: SH_petrochemical
53
+ administration: 国家发改委
54
+ JC:
55
+ prefix: JC_building_materials
56
+ administration: 国家发改委
57
+ DZ:
58
+ prefix: DZ_geological
59
+ administration: 国土资源部
60
+ TD:
61
+ prefix: TD_land_management
62
+ administration: 国土资源部
63
+ CH:
64
+ prefix: CH_surveying
65
+ administration: 国家测绘局
66
+ JB:
67
+ prefix: JB_mechanical
68
+ administration: 国家发改委
69
+ QC:
70
+ prefix: QC_automotile
71
+ administration: 国家发改委
72
+ MH:
73
+ prefix: MH_civil_aviation
74
+ administration: 中国民航管理总局
75
+ WJ:
76
+ prefix: WJ_ordnance
77
+ administration: 国防科学工业委员会
78
+ CB:
79
+ prefix: CB_ships
80
+ administration: 国防科学工业委员会
81
+ HB:
82
+ prefix: HB_aviation
83
+ administration: 国防科学工业委员会
84
+ QJ:
85
+ prefix: QJ_aerospace
86
+ administration: 国防科学工业委员会
87
+ EJ:
88
+ prefix: EJ_nuclear_industry
89
+ administration: 国防科学工业委员会
90
+ TB:
91
+ prefix: TB_rail_transport
92
+ administration: 铁道部
93
+ JT:
94
+ prefix: JT_traffic
95
+ administration: 交通部
96
+ LD:
97
+ prefix: LD_work_safety
98
+ administration: 劳动和社会保障部
99
+ SJ:
100
+ prefix: SJ_electronics
101
+ administration: 信息产业部
102
+ YD:
103
+ prefix: YD_communication
104
+ administration: 信息产业部
105
+ GY:
106
+ prefix: GY_media
107
+ administration: 国家广播电影电视总局
108
+ DL:
109
+ prefix: DL_electricity
110
+ administration: 国家发改委
111
+ JR:
112
+ prefix: JR_financial
113
+ administration: 中国人民银行
114
+ HY:
115
+ prefix: HY_oceanic
116
+ administration: 国家海洋局
117
+ DA:
118
+ prefix: DA_archiving
119
+ administration: 国家档案局
120
+ SN:
121
+ prefix: SN_inspection
122
+ administration: 国家质量监督检验检疫总局
123
+ WH:
124
+ prefix: WH_culture
125
+ administration: 文化部
126
+ TY:
127
+ prefix: TY_sports
128
+ administration: 国家体育总局
129
+ SB:
130
+ prefix: SB_business
131
+ administration: 商务部
132
+ WB:
133
+ prefix: WB_materials_management
134
+ administration: 国家发改委
135
+ HJ:
136
+ prefix: HJ_environment_protection
137
+ administration: 国家环境保护总局
138
+ XB:
139
+ prefix: XB_rare_earth
140
+ administration: 国家发改委稀土办公室
141
+ CJ:
142
+ prefix: CJ_town_construction
143
+ administration: 建设部
144
+ JG:
145
+ prefix: JG_construction_industry
146
+ administration: 建设部
147
+ CY:
148
+ prefix: CY_news_publishing
149
+ administration: 国家新闻出版总署
150
+ MT:
151
+ prefix: MT_coal
152
+ administration: 国家发改委
153
+ WS:
154
+ prefix: WS_health
155
+ administration: 卫生部
156
+ GA:
157
+ prefix: GA_public_safety
158
+ administration: 公安部
159
+ BB:
160
+ prefix: BB_packaging
161
+ administration: 国家发改委
162
+ DB:
163
+ prefix: DB_seismology
164
+ administration: 中国地震局
165
+ LB:
166
+ prefix: LB_tourism
167
+ administration: 国家旅游局
168
+ QX:
169
+ prefix: QX_meteorological
170
+ administration: 中国气象局
171
+ WM:
172
+ prefix: WM_foreign_trade
173
+ administration: 外经贸部科技司
174
+ HS:
175
+ prefix: HS_customs
176
+ administration: 海关总署
177
+ YZ:
178
+ prefix: YZ_postal
179
+ administration: 国家邮政局
180
+ GM:
181
+ prefix: GM_cryptography
182
+ administration: 国家密码管理局
183
+ AQ:
184
+ prefix: AQ_production_safety
185
+ administration: 国家安全生产管理局
186
+ GH:
187
+ prefix: GH_supply_marketing
188
+ administration: 中华全国供销合作总社
189
+ LS:
190
+ prefix: LS_food
191
+ administration: 国家粮食局
192
+ TJ:
193
+ prefix: TJ_railway_traffic
194
+ administration: 铁道部标准所
195
+ ZY:
196
+ prefix: ZY_chinese_medicine
197
+ administration: 国家中医药管理局
data/lib/gbbib.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'gbbib/version'
2
+ require 'gbbib/gb_bibliography'
metadata ADDED
@@ -0,0 +1,169 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gbbib
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ribose Inc.
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-06-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: pry-byebug
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: cnccs
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: iso-bib-item
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: 'GdBib: retrieve Chinese GB Standards for bibliographic use using the
112
+ BibliographicItem model.'
113
+ email:
114
+ - pen.source@ribose.com
115
+ executables: []
116
+ extensions: []
117
+ extra_rdoc_files: []
118
+ files:
119
+ - ".gitignore"
120
+ - ".rspec"
121
+ - ".rubocop.yml"
122
+ - ".travis.yml"
123
+ - Gemfile
124
+ - Gemfile.lock
125
+ - LICENSE.txt
126
+ - README.adoc
127
+ - Rakefile
128
+ - bin/console
129
+ - bin/setup
130
+ - gbbib.gemspec
131
+ - lib/gbbib.rb
132
+ - lib/gbbib/gb_bibliographic_item.rb
133
+ - lib/gbbib/gb_bibliography.rb
134
+ - lib/gbbib/gb_scrapper.rb
135
+ - lib/gbbib/gb_standard_type.rb
136
+ - lib/gbbib/gb_technical_committee.rb
137
+ - lib/gbbib/hit.rb
138
+ - lib/gbbib/hit_collection.rb
139
+ - lib/gbbib/scrapper.rb
140
+ - lib/gbbib/sec_scrapper.rb
141
+ - lib/gbbib/t_scrapper.rb
142
+ - lib/gbbib/version.rb
143
+ - lib/gbbib/yaml/prefixes.yaml
144
+ homepage: https://github.com/riboseinc/gdbib
145
+ licenses:
146
+ - MIT
147
+ metadata: {}
148
+ post_install_message:
149
+ rdoc_options: []
150
+ require_paths:
151
+ - lib
152
+ required_ruby_version: !ruby/object:Gem::Requirement
153
+ requirements:
154
+ - - ">="
155
+ - !ruby/object:Gem::Version
156
+ version: '0'
157
+ required_rubygems_version: !ruby/object:Gem::Requirement
158
+ requirements:
159
+ - - ">="
160
+ - !ruby/object:Gem::Version
161
+ version: '0'
162
+ requirements: []
163
+ rubyforge_project:
164
+ rubygems_version: 2.6.12
165
+ signing_key:
166
+ specification_version: 4
167
+ summary: 'GdBib: retrieve Chinese GB Standards for bibliographic use using the BibliographicItem
168
+ model.'
169
+ test_files: []