relaton-gb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.hound.yml +3 -0
- data/.rspec +3 -0
- data/.rubocop.yml +10 -0
- data/.travis.yml +17 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +84 -0
- data/LICENSE.txt +25 -0
- data/README.adoc +202 -0
- data/Rakefile +6 -0
- data/appveyor.yml +35 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/relaton/processor.rb +25 -0
- data/lib/relaton_gb.rb +7 -0
- data/lib/relaton_gb/gb_bibliographic_item.rb +111 -0
- data/lib/relaton_gb/gb_bibliography.rb +137 -0
- data/lib/relaton_gb/gb_scrapper.rb +51 -0
- data/lib/relaton_gb/gb_standard_type.rb +30 -0
- data/lib/relaton_gb/gb_technical_committee.rb +23 -0
- data/lib/relaton_gb/hit.rb +60 -0
- data/lib/relaton_gb/hit_collection.rb +45 -0
- data/lib/relaton_gb/scrapper.rb +197 -0
- data/lib/relaton_gb/sec_scrapper.rb +57 -0
- data/lib/relaton_gb/t_scrapper.rb +121 -0
- data/lib/relaton_gb/version.rb +5 -0
- data/lib/relaton_gb/xml_parser.rb +49 -0
- data/lib/relaton_gb/yaml/prefixes.yaml +197 -0
- data/relaton_gb.gemspec +39 -0
- metadata +229 -0
data/bin/setup
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "relaton/processor"
|
4
|
+
|
5
|
+
module Relaton
  module RelatonGb
    # Relaton processor for Chinese (GB) standards. It forwards lookups and
    # XML parsing to the RelatonGb implementation classes.
    class Processor < Relaton::Processor
      # Sets the identification attributes of this processor.
      # NOTE(review): these are presumably exposed through readers declared
      # on Relaton::Processor - confirm against the base class.
      def initialize
        @idtype = "Chinese Standard"
        @defaultprefix = %r{^GB }
        @prefix = "CN"
        @short = :relaton_gb
      end

      # Looks up a standard through RelatonGb::GbBibliography.get.
      #
      # @param code [String] standard code
      # @param date [String, nil] year of publication
      # @param opts [Hash] lookup options, passed through unchanged
      # @return [Object, nil] whatever RelatonGb::GbBibliography.get returns
      def get(code, date, opts)
        ::RelatonGb::GbBibliography.get(code, date, opts)
      end

      # Builds a bibliographic item from its XML serialisation through
      # RelatonGb::XMLParser.from_xml.
      #
      # @param xml [String]
      # @return [Object] whatever RelatonGb::XMLParser.from_xml returns
      def from_xml(xml)
        ::RelatonGb::XMLParser.from_xml(xml)
      end
    end
  end
end
|
data/lib/relaton_gb.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "relaton_iso_bib"
|
4
|
+
require "cnccs"
|
5
|
+
require "relaton_gb/gb_technical_committee"
|
6
|
+
require "relaton_gb/gb_standard_type"
|
7
|
+
require "relaton_gb/xml_parser"
|
8
|
+
|
9
|
+
module RelatonGb
  # GB bibliographic item class.
  class GbBibliographicItem < RelatonIsoBib::IsoBibliographicItem
    # @return [RelatonGb::GbTechnicalCommittee, nil]
    attr_reader :committee

    # @return [RelatonGb::GbStandardType]
    attr_reader :gbtype

    # @return [String, nil] not assigned anywhere in this class
    attr_reader :topic

    # @return [Array<Cnccs::Ccs>]
    attr_reader :ccs

    # @return [String, nil] not assigned anywhere in this class
    attr_reader :plan_number

    # @return [String]
    attr_reader :type, :gbplannumber

    # All keyword arguments are first handed to the superclass initializer;
    # the GB specific ones are then read here.
    #
    # @param args [Hash] item attributes, including:
    #   * :committee [Hash, nil] keywords for GbTechnicalCommittee.new
    #   * :ccs [Array<String>, nil] codes resolved through Cnccs.fetch
    #   * :gbtype [Hash] keywords for GbStandardType.new
    #   * :type [String]
    #   * :gbplannumber [String, nil] defaults to the project number of the
    #     structured identifier
    def initialize(**args)
      super
      # The constructors below take keywords only, so the hashes have to be
      # splatted: passing them positionally raises ArgumentError on Ruby 3.
      @committee = GbTechnicalCommittee.new(**args[:committee]) if args[:committee]
      # Array() lets an item without CCS codes be built with an empty list.
      @ccs = Array(args[:ccs]).map { |c| Cnccs.fetch c }
      @gbtype = GbStandardType.new(**args[:gbtype])
      @type = args[:type]
      @gbplannumber = args[:gbplannumber] || structuredidentifier.project_number
    end

    # Renders the item, appending the GB specific elements through the block
    # given to the superclass implementation.
    #
    # @param builder [Nokogiri::XML::Builder, nil] builder to render into;
    #   when omitted a new UTF-8 document is built
    # @return [String] XML of the root element when no builder is given;
    #   otherwise whatever the superclass returns
    def to_xml(builder = nil, **opts)
      return super(builder, **opts) { |xml| render_gbxml(xml) } if builder

      Nokogiri::XML::Builder.new(encoding: "UTF-8") do |bldr|
        super(bldr, **opts) { |xml| render_gbxml(xml) }
      end.doc.root.to_xml
    end

    # @return [String] class name and object address only
    def inspect
      "<#{self.class}:#{format('%#.14x', object_id << 1)}>"
      # "@fullIdentifier=\"#{@fetch&.shortref}\" "\
      # "@title=\"#{title}\">"
    end

    # @return [String]
    def to_s
      inspect
    end

    # Builds an identifier string with all whitespace removed.
    #
    # @param id [#id, nil] identifier to use; defaults to the first document
    #   identifier whose type is not "DOI"
    # @param attribute [Boolean] when truthy the id is only produced if
    #   @id_attribute is set
    # @param _delim [String] unused
    # @return [String, nil]
    def makeid(id, attribute, _delim = "")
      return nil if attribute && !@id_attribute

      id ||= @docidentifier.reject { |i| i.type == "DOI" }.first
      # if id.part_number&.size&.positive?
      #   idstr = idstr + "-#{id.part_number}"
      # end
      id.id.gsub(/\s/, "").strip
    end

    private

    # Overrides IsoBibliographicItem method.
    # @param language [Array<String>]
    # @raise ArgumentError if a language other than "en" or "zh" is given
    def check_language(language)
      language.each do |lang|
        raise ArgumentError, "invalid language: #{lang}" unless %w[en zh].include? lang
      end
    end

    # Overrides IsoBibliographicItem method.
    # @param script [Array<String>]
    # @raise ArgumentError if a script other than "Latn" or "Hans" is given
    def check_script(script)
      script.each do |scr|
        raise ArgumentError, "invalid script: #{scr}" unless %w[Latn Hans].include? scr
      end
    end

    # Appends the GB type, the CCS entries and the plan number.
    # The plan number is written whether or not the item has CCS entries.
    #
    # @param builder [Nokogiri::XML::Builder]
    def render_gbxml(builder)
      gbtype.to_xml builder
      ccs.each do |c|
        builder.ccs do
          builder.code c.code
          builder.text_ c.description
        end
      end

      builder.gbplannumber gbplannumber if gbplannumber
    end
  end
end
|
@@ -0,0 +1,137 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "relaton_gb/gb_bibliographic_item"
|
4
|
+
require "relaton_gb/gb_standard_type"
|
5
|
+
require "relaton_gb/hit_collection"
|
6
|
+
require "relaton_gb/hit"
|
7
|
+
|
8
|
+
# GB bib module.
|
9
|
+
module RelatonGb
  # GB entry point class.
  class GbBibliography
    class << self
      # rubocop:disable Metrics/MethodLength

      # Picks the scraper from the prefix of the code and runs the search.
      #
      # @param text [String] code of standard for search
      # @return [RelatonGb::HitCollection, nil] nil for prefixes without a
      #   scraper (ZB, DB, Q/); a scraper may also return nil on failure
      def search(text)
        case text
        when /^(GB|GJ|GS)/
          # Scrape national standards.
          require "relaton_gb/gb_scrapper"
          GbScrapper.scrape_page text
        when /^ZB/, /^DB/, %r{^Q\/}
          # Professional, local and enterprise standards: not scraped.
          nil
        when %r{^T\/[^\s]{3,6}\s}
          # Scrape social standard.
          require "relaton_gb/t_scrapper"
          TScrapper.scrape_page text
        else
          # Scrape sector standard.
          require "relaton_gb/sec_scrapper"
          SecScrapper.scrape_page text
        end
      end
      # rubocop:enable Metrics/MethodLength

      # @param code [String] the GB standard Code to look up (e.g. "GB/T 20223")
      # @param year [String] the year the standard was published (optional);
      #   a "-YEAR" suffix in the code takes precedence
      # @param opts [Hash] options; restricted to :all_parts if all-parts
      #   reference is required
      # @return [Object, nil] the item returned by the matching hit, or nil
      #   when nothing matched
      def get(code, year = nil, opts = {})
        # Split "CODE-YEAR" before adding the part suffix, otherwise the
        # suffix would end up in the year.
        code, year = code.split("-", 2) if code.include?("-")
        code += ".1" if opts[:all_parts]
        ret = get1(code, year, opts)
        return nil if ret.nil?

        # Keep the returned item: this is right whether the call changes the
        # receiver in place or hands back a modified copy.
        ret = ret.to_most_recent_reference unless year
        ret = ret.to_all_parts if opts[:all_parts]
        ret
      end

      private

      # Prints the "no match" diagnostics on standard error.
      #
      # @param code [String]
      # @param year [String, nil]
      # @param missed_years [Array<Integer>] years found instead of +year+
      # @return [nil]
      def fetch_ref_err(code, year, missed_years)
        id = year ? "#{code}:#{year}" : code
        warn "WARNING: no match found on the GB website for #{id}. "\
          "The code must be exactly like it is on the website."
        unless missed_years.empty?
          warn "(There was no match for #{year}, though there were matches "\
            "found for #{missed_years.join(', ')}.)"
        end
        if /\d-\d/ =~ code
          warn "The provided document part may not exist, or the document "\
            "may no longer be published in parts."
        else
          warn "If you wanted to cite all document parts for the reference, "\
            "use \"#{code} (all parts)\".\nIf the document is not a standard, "\
            "use its document type abbreviation (TS, TR, PAS, Guide)."
        end
        nil
      end

      # @param code [String]
      # @param year [String, nil]
      # @return [Object, nil] matching item; nil after the warnings are printed
      def get1(code, year, _opts)
        # search must include year whenever available
        searchcode = year.nil? ? code : "#{code}-#{year}"
        found = results_filter(search_filter(searchcode), year)
        found[:ret] || fetch_ref_err(code, year, found[:years])
      end

      # Keeps the hits whose title starts with exactly the searched code.
      #
      # @param code [String] code, with "-YEAR" appended when known
      # @return [Array<RelatonGb::Hit>] empty when the search gave nothing
      def search_filter(code)
        # search filter needs to incorporate year
        docidrx = %r{^[^\s]+\s[\d\.-]+}
        # corrigrx = %r{^[^\s]+\s[\d\.]+-[0-9]+/}
        warn "fetching #{code}..."
        # search returns nil for unsupported prefixes and failed requests
        hits = search(code) || []
        hits.select do |hit|
          hit.title && hit.title.match(docidrx).to_s == code # &&
          # !corrigrx =~ hit.title
        end
      end

      # Fetches the hits three at a time and returns the first document
      # that matches the year (or simply the first one when no year is given).
      # Documents that could not be fetched are skipped.
      # If no match, returns any years which caused mismatch, for error
      # reporting.
      #
      # @param result [Array<RelatonGb::Hit>]
      # @param year [String, nil]
      # @return [Hash] { ret: item } on success, { years: [...] } otherwise
      def results_filter(result, year)
        missed_years = []
        result.each_slice(3) do |slice|
          fetch_pages(slice, 3).compact.each do |item|
            return { ret: item } unless year

            item.dates.select { |d| d.type == "published" }.each do |d|
              return { ret: item } if year.to_i == d.on.year

              missed_years << d.on.year
            end
          end
        end
        { years: missed_years }
      end

      # Fetches the hits through a RelatonBib::WorkersPool and returns the
      # results in the order of the given hits.
      #
      # @param s [Array<RelatonGb::Hit>]
      # @param n [Integer] argument for RelatonBib::WorkersPool.new
      # @return [Array] fetched items, nil where the fetch returned nothing
      def fetch_pages(s, n)
        workers = RelatonBib::WorkersPool.new n
        workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
        s.each_with_index { |hit, i| workers << { i: i, hit: hit } }
        workers.end
        workers.result.sort_by { |x| x[:i] }.map { |x| x[:hit] }
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require "open-uri"
|
5
|
+
require "nokogiri"
|
6
|
+
require "relaton_gb/scrapper"
|
7
|
+
require "relaton_gb/gb_bibliographic_item"
|
8
|
+
|
9
|
+
module RelatonGb
  # National standard scrapper.
  module GbScrapper
    extend Scrapper

    class << self
      # Runs a search on std.gov.cn and wraps every result link in a Hit.
      #
      # @param text [String] code of standard for search
      # @return [RelatonGb::HitCollection, nil] nil (after a warning) when
      #   the site cannot be reached
      def scrape_page(text)
        search_html = OpenURI.open_uri(
          "http://www.std.gov.cn/search/stdPage?q=" + text
        )
        result = Nokogiri::HTML search_html
        hits = result.css(".s-title a").map do |h|
          Hit.new pid: h[:pid], title: h.text, scrapper: self
        end
        HitCollection.new hits
      rescue OpenURI::HTTPError, SocketError
        warn "Cannot access http://www.std.gov.cn/search/stdPage"
      end

      # Downloads the detail page of a standard and builds the item from it.
      #
      # @param pid [String] standard's page id
      # @return [RelatonGb::GbBibliographicItem, nil] nil (after a warning)
      #   when the site cannot be reached
      def scrape_doc(pid)
        src = "http://www.std.gov.cn/gb/search/gbDetailed?id=" + pid
        doc = Nokogiri::HTML OpenURI.open_uri(src)
        GbBibliographicItem.new scrapped_data(doc, src: src)
      rescue OpenURI::HTTPError, SocketError
        warn "Cannot access http://www.std.gov.cn/gb/search/gbDetailed"
      end

      # Extracts the committee name: the text between the first pair of
      # full-width parentheses that follows the first link of a paragraph.
      #
      # @param doc [Nokogiri::HTML]
      # @return [Hash]
      #   * :type [String]
      #   * :name [String] empty when no parenthesised text is found
      def get_committee(doc)
        text = doc.xpath("//p/a[1]/following-sibling::text()").text
        # U+FF08 / U+FF09 are the full-width parentheses; written as escapes
        # so the pattern survives ASCII folding of the source text.
        name = text.match(/(?<=\uFF08)[^\uFF09]+/).to_s
        { type: "technical", name: name }
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module RelatonGb
  # GB standard type: the scope, prefix and mandate of a standard.
  class GbStandardType
    # @return [String]
    attr_reader :scope, :prefix, :mandate

    # @param scope [String]
    # @param prefix [String]
    # @param mandate [String]
    def initialize(scope:, prefix:, mandate:)
      @mandate = mandate
      @prefix = prefix
      @scope = scope
    end

    # Writes a gbtype element holding gbscope, gbprefix and gbmandate.
    #
    # @param builder [Nokogiri::XML::Builder]
    def to_xml(builder)
      builder.gbtype do
        builder.gbscope(@scope)
        builder.gbprefix(@prefix)
        builder.gbmandate(@mandate)
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module RelatonGb
  # GB technical committee: a committee name together with its type.
  class GbTechnicalCommittee
    # @return [String]
    attr_reader :type, :name

    # @param type [String]
    # @param name [String]
    def initialize(type:, name:)
      @name = name
      @type = type
    end

    # Writes a gbcommittee element with a type attribute and the committee
    # name as its text.
    #
    # @param builder [Nokogiri::XML::Builder]
    def to_xml(builder)
      builder.gbcommittee(type: @type) do
        builder.text(@name)
      end
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RelatonGb
  # One search result. It remembers the scraper that produced it and uses
  # that scraper to download the full document on demand.
  class Hit
    # @return [RelatonGb::HitCollection, nil]
    attr_reader :hit_collection

    # @return [String]
    attr_reader :pid

    # @return [String]
    attr_reader :title

    # @return [RelatonGb::GbScrapper, RelatonGb::SecScrapper, RelatonGb::TScrapper]
    attr_reader :scrapper

    # Stores the attributes and, when a collection is given, appends the new
    # hit to it.
    #
    # @param pid [String] page id handed to the scraper by #fetch
    # @param title [String]
    # @param hit_collection [RelatonGb::HitCollection, nil]
    # @param scrapper [#scrape_doc]
    def initialize(pid:, title:, hit_collection: nil, scrapper:)
      @scrapper = scrapper
      @hit_collection = hit_collection
      @title = title
      @pid = pid
      @hit_collection << self if @hit_collection
    end

    # Parse page. The result is kept, so a successful download happens once.
    # @return [Object] whatever the scraper's scrape_doc returns
    def fetch
      @fetch ||= scrapper.scrape_doc(pid)
    end

    # @return [String]
    def to_s
      inspect
    end

    # @return [String] class, object address, short reference of the fetched
    #   document (empty before the first fetch) and title
    def inspect
      address = format("%#.14x", object_id << 1)
      %(<#{self.class}:#{address} @fullIdentifier="#{@fetch&.shortref}" @title="#{title}">)
    end

    # @param builder [Nokogiri::XML::Builder]
    # @param opts [Hash]
    # @return [String]
    # def to_xml(builder = nil, opts = {})
    #   if builder
    #     fetch.to_xml builder, opts
    #   else
    #     builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
    #       fetch.to_xml xml, opts
    #     end
    #     builder.doc.root.to_xml
    #   end
    # end
  end
end
|