relaton-gb 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Development setup script: installs the gem's dependencies with Bundler.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -euo pipefail
# Split words on newlines and tabs only, never on plain spaces.
IFS=$'\n\t'
# Echo every line as it is read (-v) and every command as it is run (-x).
set -vx

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "relaton/processor"
4
+
5
module Relaton
  module RelatonGb
    # Relaton processor for Chinese (GB) standards. It only delegates: lookups
    # go to ::RelatonGb::GbBibliography and XML parsing to
    # ::RelatonGb::XMLParser.
    class Processor < Relaton::Processor
      # Sets the identification attributes of this processor.
      # NOTE(review): how each attribute is consumed is defined by
      # Relaton::Processor / the registry, which is not visible here.
      def initialize
        @short = :relaton_gb
        @idtype = "Chinese Standard"
        @prefix = "CN"
        @defaultprefix = %r{^GB }
      end

      # @param code [String] code of the standard to look up
      # @param date [String, nil] year of publication
      # @param opts [Hash] options forwarded unchanged
      # @return whatever ::RelatonGb::GbBibliography.get returns
      def get(code, date, opts)
        ::RelatonGb::GbBibliography.get(code, date, opts)
      end

      # @param xml [String] XML to parse
      # @return whatever ::RelatonGb::XMLParser.from_xml returns
      def from_xml(xml)
        ::RelatonGb::XMLParser.from_xml(xml)
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
+ require "relaton_gb/version"
2
+ require "relaton_gb/gb_bibliography"
3
+
4
# Register the GB processor with Relaton, but only when the Relaton constant
# is already defined in the running process.
if defined? Relaton
  require_relative "relaton/processor"
  # The processor class is declared as Relaton::RelatonGb::Processor; the
  # previous spelling "RelatoGb" raised NameError at load time.
  Relaton::Registry.instance.register Relaton::RelatonGb::Processor
end
@@ -0,0 +1,111 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "relaton_iso_bib"
4
+ require "cnccs"
5
+ require "relaton_gb/gb_technical_committee"
6
+ require "relaton_gb/gb_standard_type"
7
+ require "relaton_gb/xml_parser"
8
+
9
+ module RelatonGb
10
+ # GB bibliographic item class.
11
+ class GbBibliographicItem < RelatonIsoBib::IsoBibliographicItem
12
+ # @return [RelatonGb::GbTechnicalCommittee]
13
+ attr_reader :committee
14
+
15
+ # @return [RelatonGb::GbStandardType]
16
+ attr_reader :gbtype
17
+
18
+ # @return [String]
19
+ attr_reader :topic
20
+
21
+ # @return [Array<Cnccs::Ccs>]
22
+ attr_reader :ccs
23
+
24
+ # @return [String]
25
+ attr_reader :plan_number
26
+
27
+ # @return [String]
28
+ attr_reader :type, :gbplannumber
29
+
30
# Builds a GB bibliographic item. All keyword arguments are first handed to
# the parent class unchanged (bare +super+); the keys below are then read here.
# @param args [Hash]
#   * :committee [Hash, nil] keyword arguments for GbTechnicalCommittee.new
#   * :ccs [Array, nil] values resolved one by one through Cnccs.fetch
#   * :gbtype [Hash] keyword arguments for GbStandardType.new
#   * :type [String, nil]
#   * :gbplannumber [String, nil] falls back to
#     structuredidentifier.project_number when absent
def initialize(**args)
  super
  # Both constructors accept keyword arguments only. Ruby 3 no longer turns a
  # positional Hash into keywords, so the hashes are splatted explicitly.
  @committee = GbTechnicalCommittee.new(**args[:committee]) if args[:committee]
  # A missing :ccs means "no classification" instead of NoMethodError on nil.
  @ccs = (args[:ccs] || []).map { |c| Cnccs.fetch c }
  @gbtype = GbStandardType.new(**args[:gbtype])
  @type = args[:type]
  @gbplannumber = args[:gbplannumber] || structuredidentifier.project_number
end
38
+
39
# Renders the item through the parent implementation, appending the
# GB-specific elements via #render_gbxml.
# @param builder [Nokogiri::XML::Builder, nil] builder to render into; when
#   omitted a new UTF-8 builder is created
# @param opts [Hash] options forwarded to the parent implementation
# @return [String] XML of the document root when no builder is given;
#   otherwise the parent implementation's return value
def to_xml(builder = nil, **opts)
  return super(builder, **opts) { |xml| render_gbxml(xml) } if builder

  document = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |bldr|
    super(bldr, **opts) { |xml| render_gbxml(xml) }
  end
  document.doc.root.to_xml
end
50
+
51
# @return [String] the class name followed by the object id shifted left by
#   one bit, printed as "0x"-prefixed hexadecimal padded to 14 digits
def inspect
  format("<%s:%#.14x>", self.class, object_id << 1)
end
57
+
58
# @return [String] the same text as #inspect
def to_s
  inspect
end
62
+
63
# Builds an identifier string with all whitespace removed.
# @param id [#id, nil] identifier to use; when nil, the first entry of
#   @docidentifier whose type is not "DOI" is taken
# @param attribute [Boolean] when truthy, an id is produced only if
#   @id_attribute is set
# @param _delim [String] unused
# @return [String, nil]
def makeid(id, attribute, _delim = "")
  return if attribute && !@id_attribute

  id ||= @docidentifier.find { |docid| docid.type != "DOI" }
  id.id.gsub(/\s/, "").strip
end
73
+
74
+ private
75
+
76
# Overrides IsoBibliographicItem method.
# @param language [Array<String>]
# @raise [ArgumentError] if any entry is neither "en" nor "zh"
def check_language(language)
  language.each do |lang|
    raise ArgumentError, "invalid language: #{lang}" unless %w[en zh].include? lang
  end
end
86
+
87
# Overrides IsoBibliographicItem method.
# @param script [Array<String>]
# @raise [ArgumentError] if any entry is neither "Latn" nor "Hans"
def check_script(script)
  script.each do |scr|
    next if %w[Latn Hans].include?(scr)

    raise ArgumentError, "invalid script: #{scr}"
  end
end
95
+
96
# Appends the GB-specific elements to the XML being built: the standard type,
# one <ccs> element (with <code> and <text>) per CCS entry, and the plan
# number when one is present.
# @param builder [Nokogiri::XML::Builder]
def render_gbxml(builder)
  gbtype.to_xml builder
  # No early return on an empty CCS list: iterating an empty list is already
  # a no-op, and the plan number below must be rendered either way.
  ccs.each do |c|
    builder.ccs do
      builder.code c.code
      # +text_+ (trailing underscore) emits a <text> element rather than
      # calling the builder's own +text+ method.
      builder.text_ c.description
    end
  end

  builder.gbplannumber gbplannumber if gbplannumber
end
110
+ end
111
+ end
@@ -0,0 +1,137 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "relaton_gb/gb_bibliographic_item"
4
+ require "relaton_gb/gb_standard_type"
5
+ require "relaton_gb/hit_collection"
6
+ require "relaton_gb/hit"
7
+
8
+ # GB bib module.
9
+ module RelatonGb
10
+ # GB entry point class.
11
+ class GbBibliography
12
+ class << self
13
# rubocop:disable Metrics/MethodLength
# Picks a scraper from the prefix of the code and runs the search. The
# scraper files are required lazily, only when their branch is taken.
# @param text [String] code of the standard to search for
# @return [RelatonGb::HitCollection, nil] nil for the prefixes that have no
#   scraper (ZB, DB, Q/)
def search(text)
  case text
  when /^(GB|GJ|GS)/
    # National standards.
    require "relaton_gb/gb_scrapper"
    GbScrapper.scrape_page text
  when /^ZB/, /^DB/, %r{^Q\/}
    # Professional, local and enterprise standards: nothing is scraped.
    nil
  when %r{^T\/[^\s]{3,6}\s}
    # Social standards.
    require "relaton_gb/t_scrapper"
    TScrapper.scrape_page text
  else
    # Sector standards.
    require "relaton_gb/sec_scrapper"
    SecScrapper.scrape_page text
  end
end
# rubocop:enable Metrics/MethodLength
38
+
39
# Looks up a single standard.
# @param code [String] the GB standard code to look up (e.g. "GB/T 20223");
#   a "-<year>" suffix is split off and takes precedence over +year+
# @param year [String] the year the standard was published (optional)
# @param opts [Hash] options; restricted to :all_parts if an all-parts
#   reference is required
# @return [Object, nil] the fetched bibliographic item, or nil when nothing
#   matched
def get(code, year = nil, opts = {})
  # Everything after the first hyphen is the year.
  code, year = code.split("-", 2) if code.include?("-")
  # Append the part suffix only after the year has been split off; doing it
  # earlier could glue ".1" onto the year (e.g. "2006.1").
  code += ".1" if opts[:all_parts]
  ret = get1(code, year, opts)
  return nil if ret.nil?

  # NOTE(review): both return values are discarded, so these calls only have
  # an effect if they mutate +ret+ in place -- confirm against the
  # RelatonIsoBib implementation.
  ret.to_most_recent_reference unless year
  ret.to_all_parts if opts[:all_parts]
  ret
end
61
+
62
+ private
63
+
64
# Reports on standard error that no reference could be found.
# @param code [String] the code that was searched for
# @param year [String, nil] the requested year, if any
# @param missed_years [Array] years of the hits that did not match
# @return [nil]
def fetch_ref_err(code, year, missed_years)
  id = if year
         "#{code}:#{year}"
       else
         code
       end
  warn "WARNING: no match found on the GB website for #{id}. "\
    "The code must be exactly like it is on the website."
  unless missed_years.empty?
    warn "(There was no match for #{year}, though there were matches "\
      "found for #{missed_years.join(', ')}.)"
  end
  # A digit-hyphen-digit sequence is taken as a reference to a document part.
  hint = if /\d-\d/ =~ code
           "The provided document part may not exist, or the document "\
             "may no longer be published in parts."
         else
           "If you wanted to cite all document parts for the reference, "\
             "use \"#{code} (all parts)\".\nIf the document is not a standard, "\
             "use its document type abbreviation (TS, TR, PAS, Guide)."
         end
  warn hint
  nil
end
80
+
81
# Searches for the code (with the year appended whenever one is given) and
# returns the first acceptable result, warning when there is none.
# @param code [String]
# @param year [String, nil]
# @param _opts [Hash] unused
# @return [Object, nil] the matching item, or nil
def get1(code, year, _opts)
  suffix = year.nil? ? "" : "-#{year}"
  hits = search_filter(code + suffix)
  return unless hits

  filtered = results_filter(hits, year)
  filtered[:ret] || fetch_ref_err(code, year, filtered[:years])
end
90
+
91
# Runs the search and keeps only the hits whose title starts with exactly the
# requested code (the code includes the year whenever one was given).
# @param code [String]
# @return [Array] matching hits; empty when nothing matched or when the
#   search produced no result at all
def search_filter(code)
  # Leading document identifier of a title: prefix, whitespace, then digits,
  # dots and hyphens.
  docidrx = %r{^[^\s]+\s[\d\.-]+}
  warn "fetching #{code}..."
  # +search+ returns nil for prefixes without a scraper and when a scraper
  # cannot reach its site; treat that as "no hits" rather than crash on nil.
  hits = search(code) || []
  hits.select do |hit|
    hit.title && hit.title.match(docidrx).to_s == code
  end
end
105
+
106
# Goes through the hits, fetching them three at a time, and returns the first
# fetched item that matches the year (or simply the first fetched item when
# no year is given). Hits whose page could not be fetched are skipped.
# @param result [Enumerable] hits to fetch
# @param year [String, nil] required year of publication
# @return [Hash] +{ ret: item }+ on success; otherwise +{ years: [...] }+
#   listing the publication years that caused a mismatch, for error reporting
def results_filter(result, year)
  missed_years = []
  result.each_slice(3) do |batch|
    # A failed page fetch yields nil (the scraper only warns); drop those
    # instead of returning nil as a result or calling #dates on nil.
    fetch_pages(batch, 3).compact.each do |item|
      return { ret: item } unless year

      item.dates.select { |d| d.type == "published" }.each do |d|
        return { ret: item } if year.to_i == d.on.year

        missed_years << d.on.year
      end
    end
  end
  { years: missed_years }
end
127
+
128
# Fetches the given hits through a RelatonBib::WorkersPool and returns the
# fetched results in the order of the hits.
# @param s [Array] hits, each responding to #fetch
# @param n [Integer] size handed to the workers pool
# @return [Array] one fetch result per hit
def fetch_pages(s, n)
  pool = RelatonBib::WorkersPool.new n
  pool.worker { |job| { i: job[:i], hit: job[:hit].fetch } }
  s.each_with_index { |hit, idx| pool << { i: idx, hit: hit } }
  pool.end
  # Results are re-ordered by the index recorded when the job was queued.
  pool.result.sort_by { |entry| entry[:i] }.map { |entry| entry[:hit] }
end
135
+ end
136
+ end
137
+ end
@@ -0,0 +1,51 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
+ require "open-uri"
5
+ require "nokogiri"
6
+ require "relaton_gb/scrapper"
7
+ require "relaton_gb/gb_bibliographic_item"
8
+
9
+ module RelatonGb
10
+ # National standard scrapper.
11
+ module GbScrapper
12
+ extend Scrapper
13
+
14
+ class << self
15
# Searches std.gov.cn for the given code and wraps every result link in a Hit.
# @param text [String] code of the standard to search for
# @return [RelatonGb::HitCollection, nil] nil (after a warning) when the
#   site cannot be reached
def scrape_page(text)
  # Codes contain spaces and slashes ("GB/T 20223"), so the value has to be
  # form-encoded before it is placed into the query string.
  query = URI.encode_www_form_component(text)
  search_html = OpenURI.open_uri(
    "http://www.std.gov.cn/search/stdPage?q=#{query}"
  )
  result = Nokogiri::HTML search_html
  hits = result.css(".s-title a").map do |h|
    Hit.new pid: h[:pid], title: h.text, scrapper: self
  end
  HitCollection.new hits
rescue OpenURI::HTTPError, SocketError
  warn "Cannot access http://www.std.gov.cn/search/stdPage"
end
29
+
30
# Downloads the detail page of one standard and builds an item from it.
# @param pid [String] standard's page id
# @return [RelatonGb::GbBibliographicItem, nil] nil (after a warning) when
#   the site cannot be reached
def scrape_doc(pid)
  src = "http://www.std.gov.cn/gb/search/gbDetailed?id=" + pid
  doc = Nokogiri::HTML OpenURI.open_uri(src)
  # GbBibliographicItem#initialize accepts keyword arguments only; splat the
  # scraped hash because Ruby 3 no longer converts a positional Hash.
  GbBibliographicItem.new(**scrapped_data(doc, src: src))
rescue OpenURI::HTTPError, SocketError
  warn "Cannot access http://www.std.gov.cn/gb/search/gbDetailed"
end
39
+
40
# Extracts the committee name from the text that follows the first link of a
# paragraph: the part enclosed in full-width parentheses.
# @param doc [Nokogiri::HTML]
# @return [Hash]
#   * :type [String] always "technical"
#   * :name [String] empty when no parenthesised text is found
def get_committee(doc)
  sibling_text = doc.xpath("//p/a[1]/following-sibling::text()").text
  # Full-width parentheses are required here; with ASCII parentheses the
  # pattern is not even a valid regular expression.
  name = sibling_text.match(/(?<=（)[^）]+/).to_s
  { type: "technical", name: name }
end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,30 @@
1
module RelatonGb
  # Type of a GB standard, described by its scope, prefix and mandate.
  class GbStandardType
    # @return [String]
    attr_reader :scope

    # @return [String]
    attr_reader :prefix

    # @return [String]
    attr_reader :mandate

    # @param scope [String]
    # @param prefix [String]
    # @param mandate [String]
    def initialize(scope:, prefix:, mandate:)
      @scope, @prefix, @mandate = scope, prefix, mandate
    end

    # Renders a <gbtype> element containing <gbscope>, <gbprefix> and
    # <gbmandate>, in that order.
    # @param builder [Nokogiri::XML::Builder]
    def to_xml(builder)
      builder.gbtype do
        builder.gbscope(@scope)
        builder.gbprefix(@prefix)
        builder.gbmandate(@mandate)
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
module RelatonGb
  # Technical committee responsible for a GB standard.
  class GbTechnicalCommittee
    # @return [String]
    attr_reader :type

    # @return [String]
    attr_reader :name

    # @param type [String]
    # @param name [String]
    def initialize(type:, name:)
      @type, @name = type, name
    end

    # Renders a <gbcommittee> element whose +type+ attribute is the committee
    # type and whose text content is the committee name.
    # @param builder [Nokogiri::XML::Builder]
    def to_xml(builder)
      builder.gbcommittee(type: @type) { builder.text(@name) }
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
module RelatonGb
  # One search result: a page id and title, plus the scraper that is able to
  # download the corresponding document.
  class Hit
    # @return [Object, nil] the collection passed to the constructor, if any
    attr_reader :hit_collection

    # @return [String]
    attr_reader :pid

    # @return [String]
    attr_reader :title

    # @return [RelatonGb::GbScrapper, RelatonGb::SecScraper, RelatonGb::TScrapper]
    attr_reader :scrapper

    # @param pid [String] page id of the document
    # @param title [String] title shown in the search results
    # @param hit_collection [#<<, nil] when given, the new hit appends itself
    #   to it
    # @param scrapper [#scrape_doc] scraper used by #fetch
    def initialize(pid:, title:, hit_collection: nil, scrapper:)
      @pid = pid
      @title = title
      @scrapper = scrapper
      @hit_collection = hit_collection
      hit_collection << self if hit_collection
    end

    # Downloads and parses the document page; a truthy result is memoized.
    # @return [Object, nil] whatever the scraper's +scrape_doc+ returns
    def fetch
      @fetch ||= scrapper.scrape_doc(pid)
    end

    # @return [String] the same text as #inspect
    def to_s
      inspect
    end

    # @return [String] class, shifted object id, the short reference of the
    #   document if it has already been fetched, and the title
    def inspect
      format(
        "<%s:%#.14x @fullIdentifier=\"%s\" @title=\"%s\">",
        self.class, object_id << 1, @fetch&.shortref, title
      )
    end

    # @param builder [Nokogiri::XML::Builder]
    # @param opts [Hash]
    # @return [String]
    # def to_xml(builder = nil, opts = {})
    #   if builder
    #     fetch.to_xml builder, opts
    #   else
    #     builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
    #       fetch.to_xml xml, opts
    #     end
    #     builder.doc.root.to_xml
    #   end
    # end
  end
end