relaton-gb 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.hound.yml +3 -0
- data/.rspec +3 -0
- data/.rubocop.yml +10 -0
- data/.travis.yml +17 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +84 -0
- data/LICENSE.txt +25 -0
- data/README.adoc +202 -0
- data/Rakefile +6 -0
- data/appveyor.yml +35 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/relaton/processor.rb +25 -0
- data/lib/relaton_gb.rb +7 -0
- data/lib/relaton_gb/gb_bibliographic_item.rb +111 -0
- data/lib/relaton_gb/gb_bibliography.rb +137 -0
- data/lib/relaton_gb/gb_scrapper.rb +51 -0
- data/lib/relaton_gb/gb_standard_type.rb +30 -0
- data/lib/relaton_gb/gb_technical_committee.rb +23 -0
- data/lib/relaton_gb/hit.rb +60 -0
- data/lib/relaton_gb/hit_collection.rb +45 -0
- data/lib/relaton_gb/scrapper.rb +197 -0
- data/lib/relaton_gb/sec_scrapper.rb +57 -0
- data/lib/relaton_gb/t_scrapper.rb +121 -0
- data/lib/relaton_gb/version.rb +5 -0
- data/lib/relaton_gb/xml_parser.rb +49 -0
- data/lib/relaton_gb/yaml/prefixes.yaml +197 -0
- data/relaton_gb.gemspec +39 -0
- metadata +229 -0
data/lib/relaton/processor.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "relaton/processor"
|
4
|
+
|
5
|
+
module Relaton
  module RelatonGb
    # Relaton processor for the :relaton_gb flavour. It only stores the
    # identifying attributes and delegates the real work to the RelatonGb
    # classes.
    class Processor < Relaton::Processor
      # Assigns the attributes identifying this processor: short name,
      # prefix, default-prefix pattern and identifier type.
      def initialize
        @idtype = "Chinese Standard"
        @defaultprefix = %r{^GB }
        @prefix = "CN"
        @short = :relaton_gb
      end

      # Looks up a standard by delegating to RelatonGb::GbBibliography.get.
      #
      # @param code [String] document code
      # @param date [String, nil] year of publication
      # @param opts [Hash] lookup options, passed through unchanged
      # @return whatever RelatonGb::GbBibliography.get returns
      def get(code, date, opts)
        ::RelatonGb::GbBibliography.get(code, date, opts)
      end

      # Parses an XML document by delegating to RelatonGb::XMLParser.from_xml.
      #
      # @param xml [String] XML to parse
      # @return whatever RelatonGb::XMLParser.from_xml returns
      def from_xml(xml)
        ::RelatonGb::XMLParser.from_xml(xml)
      end
    end
  end
end
|
data/lib/relaton_gb/gb_bibliographic_item.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "relaton_iso_bib"
|
4
|
+
require "cnccs"
|
5
|
+
require "relaton_gb/gb_technical_committee"
|
6
|
+
require "relaton_gb/gb_standard_type"
|
7
|
+
require "relaton_gb/xml_parser"
|
8
|
+
|
9
|
+
module RelatonGb
  # GB bibliographic item class. Extends the ISO bibliographic item with the
  # GB-specific data handled here: technical committee, standard type,
  # CCS classification codes and plan number.
  class GbBibliographicItem < RelatonIsoBib::IsoBibliographicItem
    # @return [RelatonGb::GbTechnicalCommittee, nil]
    attr_reader :committee

    # @return [RelatonGb::GbStandardType]
    attr_reader :gbtype

    # @return [String, nil] never assigned by this class
    attr_reader :topic

    # @return [Array<Cnccs::Ccs>]
    attr_reader :ccs

    # @return [String, nil] never assigned by this class
    attr_reader :plan_number

    # @return [String, nil]
    attr_reader :type, :gbplannumber

    # @param args [Hash] attributes; all of them are also forwarded to the
    #   superclass constructor
    # @option args [Hash] :committee keywords for GbTechnicalCommittee.new
    # @option args [Array<String>] :ccs codes resolved through Cnccs.fetch
    # @option args [Hash] :gbtype keywords for GbStandardType.new
    # @option args [String] :type
    # @option args [String] :gbplannumber falls back to the project number
    #   of the structured identifier
    # @raise [ArgumentError] when :gbtype lacks :scope, :prefix or :mandate
    def initialize(**args)
      super
      # The GB value classes take keyword arguments only, so the hashes are
      # splatted explicitly; a positional Hash is rejected on Ruby 3.
      @committee = GbTechnicalCommittee.new(**args[:committee]) if args[:committee]
      # Array() lets callers omit :ccs entirely.
      @ccs = Array(args[:ccs]).map { |c| Cnccs.fetch c }
      @gbtype = GbStandardType.new(**(args[:gbtype] || {}))
      @type = args[:type]
      @gbplannumber = args[:gbplannumber] || structuredidentifier&.project_number
    end

    # Serialises the item; the GB-specific elements are added through the
    # block handed to the superclass.
    #
    # @param builder [Nokogiri::XML::Builder, nil] when nil, a new UTF-8
    #   builder is created and the XML of its root element is returned
    # @return [String]
    def to_xml(builder = nil, **opts)
      if builder
        super(builder, **opts) { |xml| render_gbxml(xml) }
      else
        Nokogiri::XML::Builder.new(encoding: "UTF-8") do |bldr|
          super(bldr, **opts) { |xml| render_gbxml(xml) }
        end.doc.root.to_xml
      end
    end

    # @return [String] class name plus a formatted object id
    def inspect
      "<#{self.class}:#{format('%#.14x', object_id << 1)}>"
    end

    # @return [String]
    def to_s
      inspect
    end

    # Builds an identifier string with all whitespace removed.
    #
    # @param id [#id, nil] identifier; defaults to the first document
    #   identifier whose type is not "DOI"
    # @param attribute [Boolean] when truthy, nil is returned unless
    #   @id_attribute is set
    # @param _delim [String] unused
    # @return [String, nil]
    def makeid(id, attribute, _delim = "")
      return nil if attribute && !@id_attribute

      id ||= @docidentifier.reject { |i| i.type == "DOI" }.first
      id.id.gsub(/\s/, "").strip
    end

    private

    # Overrides IsoBibliographicItem method; only "en" and "zh" are allowed.
    # @param language [Array<String>]
    # @raise [ArgumentError] on any other language
    def check_language(language)
      language.each do |lang|
        raise ArgumentError, "invalid language: #{lang}" unless %w[en zh].include? lang
      end
    end

    # Overrides IsoBibliographicItem method; only "Latn" and "Hans" are
    # allowed.
    # @param script [Array<String>]
    # @raise [ArgumentError] on any other script
    def check_script(script)
      script.each do |scr|
        raise ArgumentError, "invalid script: #{scr}" unless %w[Latn Hans].include? scr
      end
    end

    # Renders the GB-specific elements: the standard type, one <ccs> element
    # per classification code, and the plan number when present.
    #
    # @param builder [Nokogiri::XML::Builder]
    def render_gbxml(builder)
      gbtype.to_xml builder

      # No early return on an empty ccs list: iterating it is a no-op, and
      # the plan number must still be rendered in that case.
      ccs.each do |c|
        builder.ccs do
          builder.code c.code
          builder.text_ c.description
        end
      end

      builder.gbplannumber gbplannumber if gbplannumber
    end
  end
end
|
@@ -0,0 +1,137 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "relaton_gb/gb_bibliographic_item"
|
4
|
+
require "relaton_gb/gb_standard_type"
|
5
|
+
require "relaton_gb/hit_collection"
|
6
|
+
require "relaton_gb/hit"
|
7
|
+
|
8
|
+
# GB bib module.
|
9
|
+
module RelatonGb
  # GB entry point class: searches for standards by code and returns the
  # matching bibliographic item.
  class GbBibliography
    class << self
      # rubocop:disable Metrics/MethodLength

      # Dispatches the search to the scrapper matching the code's prefix.
      # The scrappers are required lazily, only when their branch is taken.
      #
      # @param text [String] code of standard for search
      # @return [RelatonGb::HitCollection, nil] nil for the prefixes that
      #   have no scrapper (ZB, DB, Q/), or whatever the scrapper returns
      def search(text)
        case text
        when /^(GB|GJ|GS)/
          # Scrape national standards.
          require "relaton_gb/gb_scrapper"
          GbScrapper.scrape_page text
        when /^ZB/, /^DB/, %r{^Q\/}
          # Professional, local and enterprise standards: no scrapper is
          # implemented for them, so there is nothing to return.
          nil
        when %r{^T\/[^\s]{3,6}\s}
          # Scrape social standard.
          require "relaton_gb/t_scrapper"
          TScrapper.scrape_page text
        else
          # Scrape sector standard.
          require "relaton_gb/sec_scrapper"
          SecScrapper.scrape_page text
        end
      end
      # rubocop:enable Metrics/MethodLength

      # @param code [String] the GB standard Code to look up (e.g. "GB/T 20223")
      # @param year [String] the year the standard was published (optional)
      # @param opts [Hash] options; restricted to :all_parts if all-parts reference is required
      # @return [Object, nil] the fetched bibliographic item, or nil when
      #   nothing matched
      def get(code, year = nil, opts = {})
        if year.nil?
          # A trailing "-<year>" in the code supplies the year.
          /^(?<code1>[^-]+)-(?<year1>[^-]+)$/ =~ code
          unless code1.nil?
            code = code1
            year = year1
          end
        end

        code += ".1" if opts[:all_parts]
        code, year = code.split(/-/, 2) if /-/ =~ code
        ret = get1(code, year, opts)
        return nil if ret.nil?

        # Keep the converted items: the results of these calls used to be
        # discarded, which loses the conversion if they return a new object.
        # NOTE(review): confirm both methods return the item to use.
        ret = ret.to_most_recent_reference unless year
        ret = ret.to_all_parts if opts[:all_parts]
        ret
      end

      private

      # Prints the "no match" warnings to stderr.
      #
      # @param code [String]
      # @param year [String, nil]
      # @param missed_years [Array<Integer>] years found instead of +year+
      # @return [nil]
      def fetch_ref_err(code, year, missed_years)
        id = year ? "#{code}:#{year}" : code
        warn "WARNING: no match found on the GB website for #{id}. "\
          "The code must be exactly like it is on the website."
        warn "(There was no match for #{year}, though there were matches "\
          "found for #{missed_years.join(', ')}.)" unless missed_years.empty?
        if /\d-\d/ =~ code
          warn "The provided document part may not exist, or the document "\
            "may no longer be published in parts."
        else
          warn "If you wanted to cite all document parts for the reference, "\
            "use \"#{code} (all parts)\".\nIf the document is not a standard, "\
            "use its document type abbreviation (TS, TR, PAS, Guide)."
        end
        nil
      end

      # Searches for the code (with the year appended when given) and
      # returns the first acceptable result, or warns and returns nil.
      def get1(code, year, _opts)
        # search must include year whenever available
        searchcode = code + (year.nil? ? "" : "-#{year}")
        found = results_filter(search_filter(searchcode), year)
        return found[:ret] if found[:ret]

        fetch_ref_err(code, year, found[:years])
      end

      # Runs the search and keeps the hits whose title starts with exactly
      # the requested code.
      #
      # @param code [String]
      # @return [Array] matching hits, empty when there are none
      def search_filter(code)
        # search filter needs to incorporate year
        docidrx = %r{^[^\s]+\s[\d\.-]+}
        warn "fetching #{code}..."
        # search returns nil for prefixes without a scrapper and when a
        # scrapper could not reach its site; treat that as "no hits".
        result = search(code) || []
        ret = result.select do |hit|
          hit.title && hit.title.match(docidrx).to_s == code
        end
        return ret unless ret.empty?

        []
      end

      # Goes through the hits, fetching them three at a time, and returns
      # the first fetched item that matches the year (or simply the first
      # fetched item when no year is given). Hits whose page could not be
      # fetched (nil) are skipped.
      #
      # @param result [Array] hits to examine
      # @param year [String, nil]
      # @return [Hash] { ret: item } on success, otherwise
      #   { years: [Integer] } listing the published years that mismatched
      def results_filter(result, year)
        missed_years = []
        result.each_slice(3) do |s|
          fetch_pages(s, 3).each do |r|
            next unless r
            return { ret: r } unless year

            r.dates.select { |d| d.type == "published" }.each do |d|
              return { ret: r } if year.to_i == d.on.year

              missed_years << d.on.year
            end
          end
        end
        { years: missed_years }
      end

      # Fetches the given hits through a RelatonBib::WorkersPool of +n+
      # workers and returns the fetched items in the order of the hits.
      def fetch_pages(s, n)
        workers = RelatonBib::WorkersPool.new n
        workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
        s.each_with_index { |hit, i| workers << { i: i, hit: hit } }
        workers.end
        workers.result.sort_by { |x| x[:i] }.map { |x| x[:hit] }
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require "open-uri"
|
5
|
+
require "nokogiri"
|
6
|
+
require "relaton_gb/scrapper"
|
7
|
+
require "relaton_gb/gb_bibliographic_item"
|
8
|
+
|
9
|
+
module RelatonGb
  # National standard scrapper: searches www.std.gov.cn and scrapes the
  # detail page of a standard.
  module GbScrapper
    extend Scrapper

    class << self
      # Runs a search on the site and wraps every ".s-title a" link of the
      # result page in a Hit.
      #
      # @param text [String] code of standard for search
      # @return [RelatonGb::HitCollection, nil] nil (after a warning) when
      #   the site cannot be reached
      def scrape_page(text)
        # Codes contain spaces and slashes (e.g. "GB/T 20223"), so the query
        # value has to be URL-encoded.
        query = URI.encode_www_form_component(text)
        search_html = OpenURI.open_uri(
          "http://www.std.gov.cn/search/stdPage?q=#{query}"
        )
        result = Nokogiri::HTML search_html
        hits = result.css(".s-title a").map do |h|
          Hit.new pid: h[:pid], title: h.text, scrapper: self
        end
        HitCollection.new hits
      rescue OpenURI::HTTPError, SocketError
        warn "Cannot access http://www.std.gov.cn/search/stdPage"
      end

      # Downloads the detail page of a standard and builds the
      # bibliographic item from the data returned by scrapped_data.
      #
      # @param pid [String] standard's page id
      # @return [RelatonGb::GbBibliographicItem, nil] nil (after a warning)
      #   when the site cannot be reached
      def scrape_doc(pid)
        src = "http://www.std.gov.cn/gb/search/gbDetailed?id=" + pid
        doc = Nokogiri::HTML OpenURI.open_uri(src)
        GbBibliographicItem.new scrapped_data(doc, src: src)
      rescue OpenURI::HTTPError, SocketError
        warn "Cannot access http://www.std.gov.cn/gb/search/gbDetailed"
      end

      # Extracts the committee name: the text between full-width
      # parentheses that follows the first link of a paragraph.
      #
      # @param doc [Nokogiri::HTML]
      # @return [Hash]
      #   * :type [String]
      #   * :name [String]
      def get_committee(doc)
        following_text = doc.xpath("//p/a[1]/following-sibling::text()").text
        # Full-width parentheses; with ASCII parentheses the look-behind is
        # not even a valid regular expression.
        name = following_text.match(/(?<=（)[^）]+/).to_s
        { type: "technical", name: name }
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module RelatonGb
  # Value object holding the scope, prefix and mandate of a GB standard.
  class GbStandardType
    # @return [String]
    attr_reader :scope, :prefix, :mandate

    # @param scope [String]
    # @param prefix [String]
    # @param mandate [String]
    def initialize(scope:, prefix:, mandate:)
      @scope, @prefix, @mandate = scope, prefix, mandate
    end

    # Emits a <gbtype> element containing <gbscope>, <gbprefix> and
    # <gbmandate>, in that order.
    #
    # @param builder [Nokogiri::XML::Builder]
    def to_xml(builder)
      builder.gbtype do
        builder.gbscope scope
        builder.gbprefix prefix
        builder.gbmandate mandate
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module RelatonGb
  # Value object holding the type and name of a GB technical committee.
  class GbTechnicalCommittee
    # @return [String]
    attr_reader :type, :name

    # @param type [String]
    # @param name [String]
    def initialize(type:, name:)
      @type, @name = type, name
    end

    # Emits a <gbcommittee> element carrying the type as an attribute and
    # the name as its text.
    #
    # @param builder [Nokogiri::XML::Builder]
    def to_xml(builder)
      builder.gbcommittee(type: type) { builder.text name }
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RelatonGb
  # A single search result. It remembers the page id and title found on the
  # result page and the scrapper able to download the full document.
  class Hit
    # @return [RelatonGb::HitCollection, nil] collection this hit was added to
    attr_reader :hit_collection

    # @return [String] page id handed to the scrapper
    attr_reader :pid

    # @return [String]
    attr_reader :title

    # @return [RelatonGb::GbScrapper, RelatonGb::SecScrapper, RelatonGb::TScrapper]
    attr_reader :scrapper

    # @param pid [String]
    # @param title [String]
    # @param hit_collection [#<<, nil] when given, the new hit appends
    #   itself to it
    # @param scrapper [#scrape_doc]
    def initialize(pid:, title:, hit_collection: nil, scrapper:)
      @pid = pid
      @title = title
      @scrapper = scrapper
      @hit_collection = hit_collection
      return unless hit_collection

      self.hit_collection << self
    end

    # Scrapes the document page. A truthy result is memoised, so the
    # scrapper is not asked again.
    #
    # @return [Object] whatever scrapper.scrape_doc returns
    def fetch
      @fetch ||= scrapper.scrape_doc(pid)
    end

    # @return [String] same as #inspect
    def to_s
      inspect
    end

    # @return [String] class, formatted object id, short reference of the
    #   fetched document (empty until #fetch has been called) and title
    def inspect
      address = format('%#.14x', object_id << 1)
      shortref = @fetch&.shortref
      "<#{self.class}:#{address} @fullIdentifier=\"#{shortref}\" @title=\"#{title}\">"
    end
  end
end
|