relaton-gb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
module RelatonGb
  # One page of search hits.
  #
  # The collection is an Array of hits that additionally remembers whether the
  # hits have been fetched, the page information it was built with and,
  # optionally, the scrapper that produced it.
  class HitCollection < Array
    # @return [TrueClass, FalseClass] set to false by the constructor
    attr_reader :fetched

    # @return [Object, nil] the +hit_pages+ value given to the constructor
    attr_reader :hit_pages

    # @return [RelatonGb::GbScrapper, RelatonGb::SecScrapper, RelatonGb::TScrapper, nil]
    attr_reader :scrapper

    # @param hits [Array] hits to store in the collection
    # @param hit_pages [Object, nil] stored as given and exposed by #hit_pages
    # @param scrapper [RelatonGb::GbScrapper, RelatonGb::SecScrapper, RelatonGb::TScrapper, nil]
    #   scrapper exposed by #scrapper; optional, so existing one- and
    #   two-argument calls keep working
    def initialize(hits = [], hit_pages = nil, scrapper = nil)
      super()
      concat hits
      @fetched = false
      @hit_pages = hit_pages
      @scrapper = scrapper
    end

    # Parallel fetching of all hits is currently disabled.
    #
    # @return [RelatonGb::HitCollection]
    # def fetch
    #   workers = RelatonBib::WorkersPool.new 4
    #   workers.worker(&:fetch)
    #   each do |hit|
    #     workers << hit
    #   end
    #   workers.end
    #   workers.result
    #   @fetched = true
    #   self
    # end

    # @return [String] same text as #inspect
    def to_s
      inspect
    end

    # Short description that does not dump every hit.
    #
    # @return [String]
    def inspect
      "<#{self.class}:#{format('%#.14x', object_id << 1)} @fetched=#{@fetched}>"
    end
  end
end
@@ -0,0 +1,197 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
+ require "yaml"
5
+ require "gb_agencies"
6
+
7
module RelatonGb
  # Scraping helpers shared by the site-specific scrappers.
  #
  # All methods use implicit +self+. #scrapped_data calls +get_committee(doc)+,
  # which is not defined in this module: whatever mixes the module in has to
  # provide it.
  module Scrapper
    # XPath of the <dd> that follows the "标准号" (standard number) <dt>.
    DOCID_XPATH = '//dt[text()="标准号"]/following-sibling::dd[1]'.freeze

    # Status label text (whitespace removed) mapped to a document stage.
    STAGES = {
      "即将实施" => "published",
      "现行" => "activated",
      "废止" => "obsoleted",
    }.freeze

    # NOTE: this assigns an instance variable of the Scrapper module object
    # itself. #prefix memoizes into @prefixes of the object it runs on, which
    # is a different object when the module is mixed in.
    @prefixes = nil

    # rubocop:disable Metrics/MethodLength
    # Collects everything scraped from a standard's page into one Hash.
    #
    # @param doc [Nokogiri::HTML::Document]
    # @param src [String] url of scrapped page
    # @return [Hash]
    def scrapped_data(doc, src:)
      {
        committee: get_committee(doc),
        docid: get_docid(doc),
        titles: get_titles(doc),
        contributors: get_contributors(doc),
        type: get_type(doc),
        docstatus: get_status(doc),
        gbtype: get_gbtype(doc),
        ccs: get_ccs(doc),
        ics: get_ics(doc),
        link: [{ type: "src", content: src }],
        dates: get_dates(doc),
        language: ["zh"],
        script: ["Hans"],
        structuredidentifier: fetch_structuredidentifier(doc),
      }
    end
    # rubocop:enable Metrics/MethodLength

    # @param doc [Nokogiri::HTML::Document]
    # @param xpt [String] query locating the node that holds the identifier
    # @return [Array<RelatonBib::DocumentIdentifier>] empty when the node is
    #   missing
    def get_docid(doc, xpt = DOCID_XPATH)
      item_ref = doc.at xpt
      return [] unless item_ref

      [RelatonBib::DocumentIdentifier.new(id: item_ref.text, type: "Chinese Standard")]
    end

    # Splits the identifier into project and part numbers.
    #
    # "?" is used for every piece that cannot be determined: all of them when
    # the node is missing, project and part number when the identifier text
    # does not contain a number to split on.
    #
    # @param doc [Nokogiri::HTML::Document]
    # @param xpt [String] query locating the node that holds the identifier
    # @return [RelatonIsoBib::StructuredIdentifier]
    def fetch_structuredidentifier(doc, xpt = DOCID_XPATH)
      id = doc.at(xpt)&.text
      # Group 1: everything up to the last digit before a dot or dash.
      # Group 2: digits after a dot (the part number), or an empty string.
      m = id&.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
      RelatonIsoBib::StructuredIdentifier.new(
        project_number: m ? m[1] : "?",
        part_number: m ? m[2] : "?",
        prefix: nil,
        id: id || "?",
        type: "Chinese Standard",
      )
    end

    # Builds the publisher from the first whitespace-delimited token of the
    # identifier.
    #
    # @param doc [Nokogiri::HTML::Document]
    # @param xpt [String] query locating the node that holds the identifier
    # @return [Array<Hash>] one Hash with :entity and :roles
    def get_contributors(doc, xpt = DOCID_XPATH)
      gb_en = GbAgencies::Agencies.new("en", {}, "")
      gb_zh = GbAgencies::Agencies.new("zh", {}, "")
      name = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
      # Non-mutating #sub: when nothing matched, +name+ comes from nil.to_s,
      # which is a frozen String on Ruby 2.7+ and would make #sub! raise.
      name = name.sub(%r{/[TZ]$}, "") unless name.start_with?("GB")
      gbtype = get_gbtype(doc)
      entity = RelatonBib::Organization.new name: [
        { language: "en", content: gb_en.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
        { language: "zh", content: gb_zh.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
      ]
      [{ entity: entity, roles: ["publisher"] }]
    end

    # The Chinese title is always returned; the English one only when the
    # page has it.
    #
    # @param doc [Nokogiri::HTML::Document]
    # @return [Array<Hash>]
    #   * :title_intro [String]
    #   * :title_main [String]
    #   * :language [String]
    #   * :script [String]
    def get_titles(doc)
      titles = [{ title_main: doc.css("div.page-header h4").text, title_intro: nil,
                  language: "zh", script: "Hans" }]
      title_main = doc.css("div.page-header h5").text
      unless title_main.empty?
        titles << { title_main: title_main, title_intro: nil, language: "en", script: "Latn" }
      end
      titles
    end

    # @param _doc [Nokogiri::HTML::Document] ignored
    # @return [String] always "international-standard"
    def get_type(_doc)
      "international-standard"
    end

    # The stage is nil when the status label is missing or its text is not
    # one of the STAGES keys.
    #
    # @param doc [Nokogiri::HTML::Document]
    # @param xpt [String] query locating the status label
    # @return [RelatonBib::DocumentStatus]
    def get_status(doc, xpt = ".s-status.label:nth-child(3)")
      label = doc.at(xpt)&.text.to_s.gsub(/\s/, "")
      RelatonBib::DocumentStatus.new stage: STAGES[label]
    end

    private

    # @param doc [Nokogiri::HTML::Document]
    # @return [Hash]
    #   * :scope [String, nil]
    #   * :prefix [String, nil] nil when the prefix is not in the prefix table
    #   * :mandate [String]
    def get_gbtype(doc)
      ref = get_ref(doc)
      record = get_prefix(ref) || {}
      { scope: get_scope(doc), prefix: record["prefix"], mandate: get_mandate(ref) }
    end

    # @param doc [Nokogiri::HTML::Document]
    # @return [String] text of the standard number node, "" when missing
    def get_ref(doc)
      doc.xpath(DOCID_XPATH).text
    end

    # @param doc [Nokogiri::HTML::Document]
    # @return [Array<String>] one element: text of the node after "中国标准分类号"
    def get_ccs(doc)
      [doc&.xpath('//dt[text()="中国标准分类号"]/following-sibling::dd[1]')&.text]
    end

    # Splits the ICS code on dots; the group is right-padded with "0" to three
    # characters. Parts missing from the code are nil.
    #
    # @param doc [Nokogiri::HTML::Document]
    # @return [Array<Hash>] empty when the page has no ICS code
    #   * :field [String]
    #   * :group [String, nil]
    #   * :subgroup [String, nil]
    def get_ics(doc)
      code = doc.xpath('//dt[(.="国际标准分类号")]/following-sibling::dd[1]/span').text
      return [] if code.empty?

      field, group, subgroup = code.split "."
      [{ field: field, group: group&.ljust(3, "0"), subgroup: subgroup }]
    end

    # @param doc [Nokogiri::HTML::Document]
    # @return [String, nil] "national", "sector", or nil when the label is
    #   missing or has any other text
    def get_scope(doc)
      case doc.at(".s-status.label-info")&.text
      when "国家标准" then "national"
      when /^行业标准/ then "sector"
      end
    end

    # Looks up the part of the reference that precedes the first "/" or
    # whitespace.
    #
    # @param ref [String]
    # @return [Hash{String=>String}, nil] nil when the prefix is unknown
    def get_prefix(ref)
      pref = ref.match(/^[^\s]+/).to_s.split("/").first
      prefix pref
    end

    # The table is read from yaml/prefixes.yaml next to this file on first use
    # and kept in @prefixes afterwards.
    #
    # @param pref [String]
    # @return [Hash{String=>String}, nil]
    def prefix(pref)
      @prefixes ||= YAML.load_file(File.join(__dir__, "yaml/prefixes.yaml"))
      @prefixes[pref]
    end

    # Derives the mandate from the non-whitespace text that follows the first
    # "/" of the reference.
    #
    # @param ref [String]
    # @return [String]
    def get_mandate(ref)
      case ref.match(%r{(?<=\/)[^\s]+}).to_s
      when "T" then "recommended"
      when "Z" then "guidelines"
      else "mandatory"
      end
    end

    # @param doc [Nokogiri::HTML::Document]
    # @return [Array<Hash>]
    #   * :type [String] type of date
    #   * :on [String] date
    def get_dates(doc)
      date = doc.xpath('//dt[.="发布日期"]/following-sibling::dd[1]').text
      [{ type: "published", on: date }]
    end
  end
end
@@ -0,0 +1,57 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
+ require "net/http"
5
+ require "json"
6
+ require "nokogiri"
7
+ require "relaton_gb/scrapper"
8
+ require "relaton_gb/gb_bibliographic_item"
9
+ require "relaton_gb/hit_collection"
10
+ require "relaton_gb/hit"
11
+
12
module RelatonGb
  # Scrapper for sector standards served by www.std.gov.cn.
  module SecScrapper
    extend Scrapper

    # Failures that are reported with a warning instead of being raised.
    # SocketError covers name resolution and connection failures;
    # JSON::ParserError covers a response body that is not JSON.
    ACCESS_ERRORS = [
      Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, SocketError,
      Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
      JSON::ParserError,
    ].freeze

    class << self
      # Searches the site for a standard code.
      #
      # @param text [String] code of standard for search
      # @return [RelatonGb::HitCollection, nil] nil when the site cannot be
      #   accessed (a warning is printed)
      def scrape_page(text)
        # Standard codes contain spaces and slashes, so the query value has to
        # be escaped before the URL is parsed.
        query = URI.encode_www_form_component(text)
        uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{query}"
        res = JSON.parse Net::HTTP.get(uri)
        hits = res["rows"].map do |row|
          Hit.new pid: row["id"], title: row["STD_CODE"], scrapper: self
        end
        HitCollection.new hits
      rescue *ACCESS_ERRORS
        warn "Cannot access #{uri}"
        nil
      end

      # Downloads and scrapes the page of one standard.
      #
      # @param pid [String] standard's page id
      # @return [RelatonGb::GbBibliographicItem, nil] nil when the page cannot
      #   be accessed (a warning is printed)
      def scrape_doc(pid)
        src = "http://www.std.gov.cn/hb/search/stdHBDetailed?id=#{pid}"
        doc = Nokogiri::HTML Net::HTTP.get(URI(src))
        GbBibliographicItem.new scrapped_data(doc, src: src)
      rescue *ACCESS_ERRORS
        warn "Cannot access #{src}"
        nil
      end

      private

      # Takes the committee name from the "administration" entry of the prefix
      # record of the standard's reference.
      #
      # @param doc [Nokogiri::HTML::Document]
      # @return [Hash]
      #   * :type [String]
      #   * :name [String, nil] nil when the prefix is unknown
      def get_committee(doc)
        record = get_prefix(get_ref(doc)) || {}
        { type: "technical", name: record["administration"] }
      end
    end
  end
end
@@ -0,0 +1,121 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
+ require "open-uri"
5
+ require "nokogiri"
6
+ require "relaton_gb/scrapper"
7
+ require "relaton_gb/gb_bibliographic_item"
8
+ require "relaton_gb/hit_collection"
9
+ require "relaton_gb/hit"
10
+
11
module RelatonGb
  # Social standard scrapper (www.ttbz.org.cn).
  module TScrapper
    extend Scrapper

    class << self
      # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
      # Searches the site for a standard code.
      #
      # @param text [String] code of standard for search
      # @return [RelatonGb::HitCollection, nil] nil when the site cannot be
      #   accessed (a warning is printed)
      def scrape_page(text)
        # Hyphens are sent as em dashes (U+2014). The value is escaped with
        # URI, which open-uri loads; this file has no require for CGI.
        key = URI.encode_www_form_component(text.tr("-", [8212].pack("U")))
        url = "http://www.ttbz.org.cn/Home/Standard?searchType=2&key=#{key}"
        # Block form closes the downloaded stream once it has been parsed.
        header = OpenURI.open_uri(url) { |io| Nokogiri::HTML io }
        xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
        t_xpath = "../preceding-sibling::td[3]"
        hits = header.xpath(xpath).map do |h|
          # Turns the mis-decoded em dash of the listing back into a hyphen.
          title = h.at(t_xpath).text.gsub(/â\u0080\u0094/, "-")
          Hit.new pid: h[:href].sub(%r{\/$}, ""), title: title, scrapper: self
        end
        HitCollection.new hits
      rescue OpenURI::HTTPError, SocketError
        warn "Cannot access http://www.ttbz.org.cn/Home/Standard"
        nil
      end
      # rubocop:enable Metrics/MethodLength, Metrics/AbcSize

      # Downloads and scrapes the page of one standard.
      #
      # @param pid [String] standard's page path
      # @return [RelatonGb::GbBibliographicItem, nil] nil when the page cannot
      #   be accessed (a warning is printed)
      def scrape_doc(pid)
        src = "http://www.ttbz.org.cn#{pid}"
        doc = OpenURI.open_uri(src) do |io|
          Nokogiri::HTML(io, nil, Encoding::UTF_8.to_s)
        end
        GbBibliographicItem.new scrapped_data(doc, src: src)
      rescue OpenURI::HTTPError, SocketError
        warn "Cannot access #{src}"
        nil
      end

      private

      # rubocop:disable Metrics/MethodLength
      # Collects everything scraped from a standard's page into one Hash.
      #
      # @param doc [Nokogiri::HTML::Document]
      # @param src [String] url of scrapped page
      # @return [Hash]
      def scrapped_data(doc, src:)
        docid_xpt = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
        status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
        {
          committee: get_committee(doc),
          docid: get_docid(doc, docid_xpt),
          titles: get_titles(doc),
          type: "international-standard",
          docstatus: get_status(doc, status_xpt),
          gbtype: gbtype,
          ccs: get_ccs(doc),
          ics: get_ics(doc),
          link: [{ type: "src", content: src }],
          dates: get_dates(doc),
          language: ["zh"],
          script: ["Hans"],
          # Same node as :docid. The helper's default query looks for a
          # dt/dd pair, whereas every query in this module reads table cells.
          structuredidentifier: fetch_structuredidentifier(doc, docid_xpt),
        }
      end
      # rubocop:enable Metrics/MethodLength

      # @param doc [Nokogiri::HTML::Document]
      # @return [Hash]
      #   * :name [String] text of the cell after "团体名称"
      #   * :type [String] always "technical"
      def get_committee(doc)
        {
          name: doc.xpath('//td[.="团体名称"]/following-sibling::td[1]').text,
          type: "technical",
        }
      end

      # The Chinese title is always returned; the English one only when the
      # page has it.
      #
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<Hash>]
      def get_titles(doc)
        zh_xpath = '//td[contains(.,"中文标题")]/following-sibling::td[1]'
        titles = [{ title_main: doc.xpath(zh_xpath).text,
                    title_intro: nil, language: "zh", script: "Hans" }]
        en_xpath = '//td[contains(.,"英文标题")]/following-sibling::td[1]'
        title_main = doc.xpath(en_xpath).text
        unless title_main.empty?
          titles << { title_main: title_main, title_intro: nil, language: "en",
                      script: "Latn" }
        end
        titles
      end

      # @return [Hash] the same fixed type for every standard of this site
      def gbtype
        { scope: "social-group", prefix: "T", mandate: "mandatory" }
      end

      # def get_group_code(ref)
      #   ref.match(%r{(?<=\/)[^\s]})
      # end

      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<String>] one element: first whitespace-delimited token
      #   of the cell after "中国标准分类号", "" when there is none
      def get_ccs(doc)
        [doc.xpath('//td[contains(.,"中国标准分类号")]/following-sibling::td[1]')
          .text.gsub(/[\r\n]/, "").strip.match(/^[^\s]+/).to_s]
      end

      # Splits the ICS code on dots; the group is right-padded with "0" to
      # three characters. Parts missing from the code are nil.
      #
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<Hash>] empty when the page has no ICS code
      def get_ics(doc)
        xpath = '//td[contains(.,"国际标准分类号")]/following-sibling::td[1]/span'
        code = doc.xpath(xpath).text.match(/^[^\s]+/).to_s
        return [] if code.empty?

        field, group, subgroup = code.split "."
        [{ field: field, group: group&.ljust(3, "0"), subgroup: subgroup }]
      end

      # Reformats the publication date as "YYYY-MM-DD".
      #
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<Hash>] empty when the cell does not hold a date
      def get_dates(doc)
        d = doc.xpath('//td[contains(.,"发布日期")]/following-sibling::td[1]/span')
          .text.match(/(?<y>\d{4})[^\d]+(?<m>\d{2})[^\d]+(?<d>\d{2})/)
        return [] unless d

        [{ type: "published", on: "#{d[:y]}-#{d[:m]}-#{d[:d]}" }]
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module RelatonGb
  # Version of the relaton-gb gem.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,49 @@
1
+ require "nokogiri"
2
+
3
module RelatonGb
  # Builds GbBibliographicItem objects from XML, adding the GB-specific fields
  # to the data collected by RelatonIsoBib::XMLParser.
  class XMLParser < RelatonIsoBib::XMLParser
    class << self
      # @param xml [String] document whose root element is bibitem or bibdata
      # @return [RelatonGb::GbBibliographicItem, nil] nil when the root
      #   element is neither bibitem nor bibdata (a warning is printed)
      def from_xml(xml)
        doc = Nokogiri::XML(xml)
        gbitem = doc.at "/bibitem|/bibdata"
        unless gbitem
          warn "[relaton-gb] cannot find bibitem or bibdata root element in the XML"
          return
        end

        GbBibliographicItem.new item_data(gbitem)
      end

      private

      # Extends the Hash built by the superclass with :committee, :gbtype,
      # :ccs and :plannumber.
      #
      # @param gbitem [Nokogiri::XML::Element]
      # @return [Hash]
      def item_data(gbitem)
        data = super
        data[:committee] = fetch_committee gbitem
        data[:gbtype] = fetch_gbtype gbitem
        data[:ccs] = fetch_ccs gbitem
        data[:plannumber] = gbitem.at("./plannumber")&.text
        data
      end

      # Override get_id from RelatonIsoBib::XMLParser
      # def get_id(did)
      #   did.text.match(/^(?<project>.*?\d+)(?<hyphen>-)?(?(<hyphen>)(?<year>\d*))/)
      # end

      # @param doc [Nokogiri::XML::Element]
      # @return [Hash, nil] :type and :name of ext/gbcommittee, nil when the
      #   element is absent
      def fetch_committee(doc)
        committee = doc.at "./ext/gbcommittee"
        return nil unless committee

        { type: committee[:type], name: committee.text }
      end

      # @param doc [Nokogiri::XML::Element]
      # @return [Array<String>] text of every ext/ccs/code element
      def fetch_ccs(doc)
        doc.xpath("./ext/ccs/code").map(&:text)
      end

      # Every value is nil when ext/gbtype or the respective child is absent.
      #
      # @param doc [Nokogiri::XML::Element]
      # @return [Hash] :scope, :prefix and :mandate
      def fetch_gbtype(doc)
        gbtype = doc.at "./ext/gbtype"
        {
          scope: gbtype&.at("gbscope")&.text,
          prefix: gbtype&.at("gbprefix")&.text,
          mandate: gbtype&.at("gbmandate")&.text,
        }
      end
    end
  end
end