relaton-gb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Development bootstrap script: installs the gem's dependencies with Bundler.

# Abort on the first failing command, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Split words on newlines and tabs only, never on plain spaces.
IFS=$'\n\t'

# Echo each line as it is read and each command as it is executed.
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "relaton/processor"
4
+
5
module Relaton
  module RelatonGb
    # Adapter that exposes the GB (Chinese standard) backend through the
    # Relaton::Processor interface.
    class Processor < Relaton::Processor
      # Sets the identification attributes of this backend. The superclass
      # constructor is intentionally not invoked here (same as before).
      def initialize
        @idtype = "Chinese Standard"
        @defaultprefix = /^GB /
        @prefix = "CN"
        @short = :relaton_gb
      end

      # Looks a reference up by delegating to RelatonGb::GbBibliography.get.
      # @param code [String] standard code
      # @param date [String, nil] year of publication
      # @param opts [Hash] lookup options, passed through unchanged
      # @return whatever RelatonGb::GbBibliography.get returns
      def get(code, date, opts)
        ::RelatonGb::GbBibliography.get code, date, opts
      end

      # Parses a bibliographic item from XML by delegating to
      # RelatonGb::XMLParser.from_xml.
      # @param xml [String]
      # @return whatever RelatonGb::XMLParser.from_xml returns
      def from_xml(xml)
        ::RelatonGb::XMLParser.from_xml(xml)
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
+ require "relaton_gb/version"
2
+ require "relaton_gb/gb_bibliography"
3
+
4
# Register the GB processor with the Relaton registry, but only when the
# host application has already loaded Relaton.
if defined? Relaton
  require_relative "relaton/processor"
  # The processor class lives in Relaton::RelatonGb (see relaton/processor);
  # the previous constant name "RelatoGb" does not exist and raised NameError.
  Relaton::Registry.instance.register Relaton::RelatonGb::Processor
end
@@ -0,0 +1,111 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "relaton_iso_bib"
4
+ require "cnccs"
5
+ require "relaton_gb/gb_technical_committee"
6
+ require "relaton_gb/gb_standard_type"
7
+ require "relaton_gb/xml_parser"
8
+
9
module RelatonGb
  # GB bibliographic item class. Extends the ISO bibliographic item with the
  # GB-specific attributes handled below (committee, standard type, CCS
  # classifications and plan number).
  class GbBibliographicItem < RelatonIsoBib::IsoBibliographicItem
    # @return [RelatonGb::GbTechnicalCommittee, nil] nil when no committee
    #   was supplied to the constructor
    attr_reader :committee

    # @return [RelatonGb::GbStandardType]
    attr_reader :gbtype

    # @return [String, nil] not assigned anywhere in this class
    attr_reader :topic

    # @return [Array<Cnccs::Ccs>]
    attr_reader :ccs

    # @return [String, nil] not assigned anywhere in this class
    attr_reader :plan_number

    # @return [String]
    attr_reader :type, :gbplannumber

    # All arguments are forwarded to the superclass; the keys below are the
    # ones additionally consumed here.
    # @param args [Hash]
    # @option args [Hash] :committee keyword arguments for GbTechnicalCommittee
    # @option args [Array] :ccs codes looked up with Cnccs.fetch (optional)
    # @option args [Hash] :gbtype keyword arguments for GbStandardType
    # @option args [String] :type
    # @option args [String] :gbplannumber defaults to the structured
    #   identifier's project number when one is available
    def initialize(**args)
      super
      # The attribute hashes are splatted explicitly: the target constructors
      # take keywords only, and a positional Hash is no longer converted to
      # keywords implicitly on Ruby 3.
      args[:committee] && @committee = GbTechnicalCommittee.new(**args[:committee])
      # A missing :ccs is treated as "no classifications" instead of crashing.
      @ccs = (args[:ccs] || []).map { |c| Cnccs.fetch c }
      @gbtype = GbStandardType.new(**args[:gbtype])
      @type = args[:type]
      @gbplannumber = args[:gbplannumber] || structuredidentifier&.project_number
    end

    # Renders the item as XML, appending the GB-specific elements.
    # @param builder [Nokogiri::XML::Builder, nil] existing builder to render
    #   into; when nil a new UTF-8 document is built
    # @param opts [Hash] passed through to the superclass renderer
    # @return [String] XML of the root element when no builder is given;
    #   otherwise whatever the superclass renderer returns
    def to_xml(builder = nil, **opts)
      if builder
        super(builder, **opts) { |xml| render_gbxml(xml) }
      else
        Nokogiri::XML::Builder.new(encoding: "UTF-8") do |bldr|
          super(bldr, **opts) { |xml| render_gbxml(xml) }
        end.doc.root.to_xml
      end
    end

    # @return [String] class name followed by a hexadecimal value derived
    #   from object_id
    def inspect
      "<#{self.class}:#{format('%#.14x', object_id << 1)}>"
      # "@fullIdentifier=\"#{@fetch&.shortref}\" "\
      # "@title=\"#{title}\">"
    end

    # @return [String]
    def to_s
      inspect
    end

    # Builds an identifier string with all whitespace removed.
    # @param id [#id, nil] document identifier; defaults to the first
    #   non-DOI entry of @docidentifier
    # @param attribute [Boolean] when truthy, nil is returned unless
    #   @id_attribute is set
    # @param _delim [String] unused
    # @return [String, nil]
    def makeid(id, attribute, _delim = "")
      return nil if attribute && !@id_attribute

      id ||= @docidentifier.reject { |i| i.type == "DOI" }[0]
      idstr = id.id
      # if id.part_number&.size&.positive?
      #   idstr = idstr + "-#{id.part_number}"
      # end
      idstr.gsub(/\s/, "").strip
    end

    private

    # Overrides IsoBibliographicItem method. Only "en" and "zh" are accepted.
    # @param language [Array<String>]
    # @raise ArgumentError
    def check_language(language)
      language.each do |lang|
        unless %w[en zh].include? lang
          raise ArgumentError, "invalid language: #{lang}"
        end
      end
    end

    # Overrides IsoBibliographicItem method. Only "Latn" and "Hans" are
    # accepted.
    # @param script [Array<String>]
    # @raise ArgumentError
    def check_script(script)
      script.each do |scr|
        raise ArgumentError, "invalid script: #{scr}" unless %w[Latn Hans].include? scr
      end
    end

    # Appends the GB-specific elements: the standard type, one <ccs> element
    # per classification, and the plan number when present.
    # @param builder [Nokogiri::XML::Builder]
    def render_gbxml(builder)
      gbtype.to_xml builder

      # No early return on an empty ccs list: the plan number below has to be
      # rendered whether or not classifications exist.
      ccs.each do |c|
        builder.ccs do
          builder.code c.code
          builder.text_ c.description
        end
      end

      builder.gbplannumber gbplannumber if gbplannumber
    end
  end
end
@@ -0,0 +1,137 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "relaton_gb/gb_bibliographic_item"
4
+ require "relaton_gb/gb_standard_type"
5
+ require "relaton_gb/hit_collection"
6
+ require "relaton_gb/hit"
7
+
8
# GB bib module.
module RelatonGb
  # GB entry point class.
  class GbBibliography
    class << self
      # rubocop:disable Metrics/MethodLength

      # Runs a search with the scrapper that matches the prefix of the code.
      # @param text [String] code of standard for search
      # @return [RelatonGb::HitCollection, nil] nil for prefixes that have no
      #   scrapper (ZB, DB, Q/) and whenever the selected scrapper returns nil
      def search(text)
        case text
        when /^(GB|GJ|GS)/
          # Scrape national standards.
          require "relaton_gb/gb_scrapper"
          GbScrapper.scrape_page text
        when /^ZB/, /^DB/, %r{^Q/}
          # Professional, local and enterprise standards: no scrapper exists.
          nil
        when %r{^T/[^\s]{3,6}\s}
          # Scrape social standard.
          require "relaton_gb/t_scrapper"
          TScrapper.scrape_page text
        else
          # Scrape sector standard.
          require "relaton_gb/sec_scrapper"
          SecScrapper.scrape_page text
        end
      end
      # rubocop:enable Metrics/MethodLength

      # @param code [String] the GB standard Code to look up (e.g. "GB/T 20223")
      # @param year [String] the year the standard was published (optional)
      # @param opts [Hash] options; restricted to :all_parts if all-parts reference is required
      # @return [Object, nil] the fetched bibliographic item (the result of
      #   Hit#fetch), or nil when nothing matched
      def get(code, year = nil, opts = {})
        if year.nil?
          # A trailing "-<year>" in the code supplies the year.
          /^(?<code1>[^-]+)-(?<year1>[^-]+)$/ =~ code
          unless code1.nil?
            code = code1
            year = year1
          end
        end

        code += ".1" if opts[:all_parts]
        code, year = code.split(/-/, 2) if /-/ =~ code
        ret = get1(code, year, opts)
        return nil if ret.nil?

        ret.to_most_recent_reference unless year
        ret.to_all_parts if opts[:all_parts]
        ret
      end

      private

      # Prints diagnostics for a failed lookup.
      # @param code [String]
      # @param year [String, nil]
      # @param missed_years [Array<Integer>] publication years of candidates
      #   rejected because of the requested year
      # @return [nil]
      def fetch_ref_err(code, year, missed_years)
        id = year ? "#{code}:#{year}" : code
        warn "WARNING: no match found on the GB website for #{id}. "\
          "The code must be exactly like it is on the website."
        warn "(There was no match for #{year}, though there were matches "\
          "found for #{missed_years.join(', ')}.)" unless missed_years.empty?
        if /\d-\d/ =~ code
          warn "The provided document part may not exist, or the document "\
            "may no longer be published in parts."
        else
          warn "If you wanted to cite all document parts for the reference, "\
            "use \"#{code} (all parts)\".\nIf the document is not a standard, "\
            "use its document type abbreviation (TS, TR, PAS, Guide)."
        end
        nil
      end

      # @param code [String]
      # @param year [String, nil]
      # @param _opts [Hash] unused
      # @return [Object, nil] matching item, or nil after printing diagnostics
      def get1(code, year, _opts)
        # search must include year whenever available
        searchcode = year.nil? ? code : "#{code}-#{year}"
        ret = results_filter(search_filter(searchcode), year)
        ret[:ret] || fetch_ref_err(code, year, ret[:years])
      end

      # Searches for the code and keeps the hits whose title starts with
      # exactly that code.
      # @param code [String] search code, including the year when known
      # @return [Array] matching hits; empty when nothing matched or when the
      #   search produced no result at all
      def search_filter(code)
        # search filter needs to incorporate year
        docidrx = %r{^[^\s]+\s[\d\.-]+}
        # corrigrx = %r{^[^\s]+\s[\d\.]+-[0-9]+/}
        warn "fetching #{code}..."
        # search returns nil for unsupported prefixes and after scrapper
        # failures; treat that as "no hits" rather than calling select on nil.
        hits = search(code) || []
        hits.select do |hit|
          hit.title && hit.title.match(docidrx).to_s == code # &&
          # !corrigrx =~ hit.title
        end
      end

      # Goes through the hits, fetching them three at a time, and returns the
      # first fetched item that matches the year (or simply the first fetched
      # item when no year is given).
      # If no match, returns any years which caused mismatch, for error reporting.
      # @param result [Array] hits
      # @param year [String, nil]
      # @return [Hash] { ret: item } on success, { years: [...] } otherwise
      def results_filter(result, year)
        missed_years = []
        result.each_slice(3) do |s|
          # Items whose fetch produced nil are skipped, so a failed download
          # can never be returned as the match nor dereferenced below.
          fetch_pages(s, 3).compact.each do |r|
            return { ret: r } unless year

            r.dates.select { |d| d.type == "published" }.each do |d|
              return { ret: r } if year.to_i == d.on.year

              missed_years << d.on.year
            end
          end
        end
        { years: missed_years }
      end

      # Fetches the given hits through a RelatonBib::WorkersPool.
      # @param s [Array] hits to fetch
      # @param n [Integer] argument passed to RelatonBib::WorkersPool.new
      # @return [Array] fetched items, in the order of the given hits
      def fetch_pages(s, n)
        workers = RelatonBib::WorkersPool.new n
        workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
        s.each_with_index { |hit, i| workers << { i: i, hit: hit } }
        workers.end
        # Results are re-ordered by the index recorded with each job.
        workers.result.sort_by { |x| x[:i] }.map { |x| x[:hit] }
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
+ require "open-uri"
5
+ require "nokogiri"
6
+ require "relaton_gb/scrapper"
7
+ require "relaton_gb/gb_bibliographic_item"
8
+
9
module RelatonGb
  # National standard scrapper.
  module GbScrapper
    extend Scrapper

    class << self
      # Queries the search page of www.std.gov.cn and wraps every result
      # link in a Hit.
      # @param text [String] code of standard for search
      # @return [RelatonGb::HitCollection, nil] nil when the site cannot be
      #   reached (a warning is printed)
      def scrape_page(text)
        # The code is placed in the query string, so it has to be encoded:
        # codes such as "GB/T 20223" contain a space and a slash.
        search_html = OpenURI.open_uri(
          "http://www.std.gov.cn/search/stdPage?q=" +
          URI.encode_www_form_component(text)
        )
        result = Nokogiri::HTML search_html
        hits = result.css(".s-title a").map do |h|
          Hit.new pid: h[:pid], title: h.text, scrapper: self
        end
        HitCollection.new hits
      rescue OpenURI::HTTPError, SocketError
        warn "Cannot access http://www.std.gov.cn/search/stdPage"
      end

      # Downloads a standard's detail page and builds a bibliographic item
      # from it.
      # @param pid [String] standard's page id
      # @return [RelatonGb::GbBibliographicItem, nil] nil when the site
      #   cannot be reached (a warning is printed)
      def scrape_doc(pid)
        src = "http://www.std.gov.cn/gb/search/gbDetailed?id=" + pid
        doc = Nokogiri::HTML OpenURI.open_uri(src)
        GbBibliographicItem.new scrapped_data(doc, src: src)
      rescue OpenURI::HTTPError, SocketError
        warn "Cannot access http://www.std.gov.cn/gb/search/gbDetailed"
      end

      # Extracts the committee name: the text enclosed in parentheses that
      # follows the first link of a paragraph.
      # @param doc [Nokogiri::HTML]
      # @return [Hash]
      #   * :type [String]
      #   * :name [String]
      def get_committee(doc)
        # Full-width parentheses (U+FF08 / U+FF09) are matched through
        # escapes so the pattern does not depend on the file encoding; ASCII
        # parentheses are accepted as well. The previous literal had an
        # unclosed look-behind and was not a valid regular expression.
        name = doc.xpath("//p/a[1]/following-sibling::text()").text.
          match(/(?<=[(\uFF08])[^)\uFF09]+/).to_s
        { type: "technical", name: name }
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
module RelatonGb
  # Value object holding the three parts of a GB standard type.
  class GbStandardType
    # @return [String]
    attr_reader :scope, :prefix, :mandate

    # @param scope [String]
    # @param prefix [String]
    # @param mandate [String]
    def initialize(scope:, prefix:, mandate:)
      @scope, @prefix, @mandate = scope, prefix, mandate
    end

    # Writes a <gbtype> element with <gbscope>, <gbprefix> and <gbmandate>
    # children, in that order.
    # @param builder [Nokogiri::XML::Builder]
    def to_xml(builder)
      builder.gbtype do
        { gbscope: @scope, gbprefix: @prefix, gbmandate: @mandate }.each do |tag, value|
          builder.public_send tag, value
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
module RelatonGb
  # Value object describing the technical committee of a GB standard.
  class GbTechnicalCommittee
    # @return [String]
    attr_reader :type, :name

    # @param type [String]
    # @param name [String]
    def initialize(type:, name:)
      @type, @name = type, name
    end

    # Writes a <gbcommittee> element carrying the type as an attribute and
    # the name as its text content.
    # @param builder [Nokogiri::XML::Builder]
    def to_xml(builder)
      builder.gbcommittee(type: @type) { builder.text @name }
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
module RelatonGb
  # A single search result. The document behind it is loaded lazily, through
  # the scrapper that produced the hit.
  class Hit
    # @return [RelatonGb::HitCollection, nil] collection this hit was added to
    attr_reader :hit_collection

    # @return [String] page id handed to the scrapper when fetching
    attr_reader :pid

    # @return [String]
    attr_reader :title

    # @return [RelatonGb::GbScrapper, RelatonGb::SecScraper, RelatonGb::TScrapper]
    attr_reader :scrapper

    # @param pid [String]
    # @param title [String]
    # @param hit_collection [RelatonGb::HitCollection, nil] when given, the
    #   new hit appends itself to it
    # @param scrapper [#scrape_doc]
    def initialize(pid:, title:, hit_collection: nil, scrapper:)
      @pid, @title, @scrapper = pid, title, scrapper
      @hit_collection = hit_collection
      @hit_collection << self if @hit_collection
    end

    # Loads the document via the scrapper and caches a truthy result.
    # @return the value returned by scrapper.scrape_doc
    def fetch
      return @fetch if @fetch

      @fetch = scrapper.scrape_doc(pid)
    end

    # @return [String]
    def to_s
      inspect
    end

    # @return [String] class name, a hexadecimal value derived from
    #   object_id, the short reference of the fetched document (empty until
    #   fetched) and the title
    def inspect
      address = format('%#.14x', object_id << 1)
      "<#{self.class}:#{address} @fullIdentifier=\"#{@fetch&.shortref}\" @title=\"#{title}\">"
    end

    # @param builder [Nokogiri::XML::Builder]
    # @param opts [Hash]
    # @return [String]
    # def to_xml(builder = nil, opts = {})
    #   if builder
    #     fetch.to_xml builder, opts
    #   else
    #     builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
    #       fetch.to_xml xml, opts
    #     end
    #     builder.doc.root.to_xml
    #   end
    # end
  end
end