relaton-gb 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RelatonGb
4
# Array of search hits together with bookkeeping about the search that
# produced them.
class HitCollection < Array
  # @return [Boolean] set to false when the collection is created
  attr_reader :fetched

  # @return [Object, nil] paging information handed to the constructor
  attr_reader :hit_pages

  # @return [RelatonGb::GbScrapper, RelatonGb::SecScrapper, RelatonGb::TScrapper, nil]
  #   scrapper handed to the constructor, nil when none was given
  attr_reader :scrapper

  # @param hits [Array] hits to store in the collection
  # @param hit_pages [Object, nil] paging information, kept as given
  # @param scrapper [RelatonGb::GbScrapper, RelatonGb::SecScrapper, RelatonGb::TScrapper, nil]
  #   optional scrapper exposed through {#scrapper}
  def initialize(hits = [], hit_pages = nil, scrapper = nil)
    concat hits
    @fetched = false
    @hit_pages = hit_pages
    @scrapper = scrapper
  end

  # @return [RelatonGb::HitCollection]
  # def fetch
  #   workers = RelatonBib::WorkersPool.new 4
  #   workers.worker(&:fetch)
  #   each do |hit|
  #     workers << hit
  #   end
  #   workers.end
  #   workers.result
  #   @fetched = true
  #   self
  # end

  # @return [String] same text as {#inspect}
  def to_s
    inspect
  end

  # Short description that omits the hits themselves.
  #
  # @return [String]
  def inspect
    "<#{self.class}:#{format('%#.14x', object_id << 1)} @fetched=#{@fetched}>"
  end
end
45
+ end
@@ -0,0 +1,197 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
+ require "yaml"
5
+ require "gb_agencies"
6
+
7
+ module RelatonGb
8
+ # Common scrapping methods.
9
+ module Scrapper
10
+ @prefixes = nil
11
+
12
# rubocop:disable Metrics/MethodLength
# Gathers every attribute scraped from a standard's page into one hash.
#
# @param doc [Nokogiri::HTML::Document] parsed page
# @param src [String] url of scrapped page
# @return [Hash] scraped attributes keyed by symbol
def scrapped_data(doc, src:)
  source_link = { type: "src", content: src }
  {
    committee: get_committee(doc),
    docid: get_docid(doc),
    titles: get_titles(doc),
    contributors: get_contributors(doc),
    type: get_type(doc),
    docstatus: get_status(doc),
    gbtype: get_gbtype(doc),
    ccs: get_ccs(doc),
    ics: get_ics(doc),
    link: [source_link],
    dates: get_dates(doc),
    language: ["zh"],
    script: ["Hans"],
    structuredidentifier: fetch_structuredidentifier(doc),
  }
end
# rubocop:enable Metrics/MethodLength
35
+
36
# Reads the document identifier from the first node matching +xpt+.
#
# @param doc [Nokogiri::HTML::Document]
# @param xpt [String] XPath of the node holding the identifier
# @return [Array<RelatonBib::DocumentIdentifier>] empty when no node matches
def get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  node = doc.at(xpt)
  if node
    [RelatonBib::DocumentIdentifier.new(id: node.text, type: "Chinese Standard")]
  else
    []
  end
end
45
+
46
# Builds the structured identifier from the text of the first node
# matching +xpt+.
#
# The project number is everything up to and including the first run of
# digits; the part number is the digits after a following ".", or "" when
# there is none. "?" placeholders are used when the node is missing or its
# text contains no number, instead of raising on a failed match.
#
# @param doc [Nokogiri::HTML::Document]
# @param xpt [String] XPath of the node holding the identifier
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredidentifier(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  text = doc.at(xpt)&.text.to_s
  m = text.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
  # prefix = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
  RelatonIsoBib::StructuredIdentifier.new(
    project_number: m ? m[1] : "?", part_number: m ? m[2] : "?", prefix: nil,
    id: text.empty? ? "?" : text, type: "Chinese Standard"
  )
end
65
+
66
# Builds the publisher entry from the standard's prefix.
#
# The agency name is looked up in English and Chinese through
# GbAgencies::Agencies#standard_agency1, using the scope and mandate
# returned by get_gbtype and the first whitespace-delimited token of the
# identifier (with a trailing "/T" or "/Z" removed unless it starts with "GB").
#
# @param doc [Nokogiri::HTML::Document]
# @param xpt [String] XPath of the node holding the identifier
# @return [Array<Hash>]
#   * :entity [RelatonBib::Organization]
#   * :roles [Array<String>]
def get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  name = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
  # Non-destructive sub: when nothing matched, name is the string returned by
  # nil.to_s, which recent Rubies return frozen.
  name = name.sub(%r{/[TZ]$}, "") unless name.start_with?("GB")
  gbtype = get_gbtype(doc)
  names = %w[en zh].map do |lang|
    agencies = GbAgencies::Agencies.new(lang, {}, "")
    { language: lang,
      content: agencies.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) }
  end
  [{ entity: RelatonBib::Organization.new(name: names), roles: ["publisher"] }]
end
78
+
79
+ # @param doc [Nokogiri::HTML::Document]
80
+ # @return [Array<Hash>]
81
+ # * :title_intro [String]
82
+ # * :title_main [String]
83
+ # * :language [String]
84
+ # * :script [String]
85
+ def get_titles(doc)
86
+ titles = [{ title_main: doc.css("div.page-header h4").text, title_intro: nil,
87
+ language: "zh", script: "Hans" }]
88
+ title_main = doc.css("div.page-header h5").text
89
+ unless title_main.empty?
90
+ titles << { title_main: title_main, title_intro: nil, language: "en", script: "Latn" }
91
+ end
92
+ titles
93
+ end
94
+
95
# Document type reported for every scraped page; the page is not inspected.
#
# @param _doc [Nokogiri::HTML::Document] unused
# @return [String]
def get_type(_doc)
  "international-standard"
end
98
+
99
# Maps the status label found at +xpt+ to a document stage.
#
# Whitespace is removed from the label before comparison. An unrecognised
# label, or a page with no node at +xpt+, yields a nil stage rather than
# raising.
#
# @param doc [Nokogiri::HTML::Document]
# @param xpt [String] selector of the status label
# @return [RelatonBib::DocumentStatus]
def get_status(doc, xpt = ".s-status.label:nth-child(3)")
  label = doc.at(xpt)&.text.to_s.gsub(/\s/, "")
  stage = case label
          when "即将实施" then "published" # "about to take effect"
          when "现行" then "activated"     # "in force"
          when "废止" then "obsoleted"     # "abolished"
          end
  RelatonBib::DocumentStatus.new stage: stage
end
113
+
114
+ private
115
+
116
# Derives scope, prefix and mandate from the page and its identifier.
#
# The prefix is nil when get_prefix finds no entry for the identifier,
# instead of raising on the missing entry.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Hash]
#   * :scope [String]
#   * :prefix [String]
#   * :mandate [String]
def get_gbtype(doc)
  ref = get_ref(doc)
  prefix_entry = get_prefix(ref) || {}
  { scope: get_scope(doc), prefix: prefix_entry["prefix"], mandate: get_mandate(ref) }
end
126
+
127
# Text of the dd element that follows the "标准号" dt.
#
# @param doc [Nokogiri::HTML::Document]
# @return [String]
def get_ref(doc)
  number_cell = doc.xpath('//dt[text()="标准号"]/following-sibling::dd[1]')
  number_cell.text
end
132
+
133
# Text of the dd element that follows the "中国标准分类号" dt, wrapped in an
# array. The single element is nil when +doc+ is nil.
#
# @param doc [Nokogiri::HTML::Document, nil]
# @return [Array<String>]
def get_ccs(doc)
  code = doc&.xpath('//dt[text()="中国标准分类号"]/following-sibling::dd[1]')&.text
  [code]
end
138
+
139
# Splits the ICS code found after the "国际标准分类号" dt into its parts.
#
# The group is right-padded with "0" to three characters. A code without a
# "." separator yields a nil group, and empty text yields no entry, instead
# of raising.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] empty when the page has no ICS code
#   * :field [String]
#   * :group [String, nil]
#   * :subgroup [String, nil]
def get_ics(doc)
  ics = doc.xpath('//dt[(.="国际标准分类号")]/following-sibling::dd[1]/span')
  return [] if ics.empty?

  field, group, subgroup = ics.text.split "."
  return [] unless field

  [{ field: field, group: group&.ljust(3, "0"), subgroup: subgroup }]
end
151
+
152
# Translates the ".s-status.label-info" label into a scope name.
#
# @param doc [Nokogiri::HTML::Document]
# @return [String, nil] "national", "sector", or nil for any other label
def get_scope(doc)
  case doc.at(".s-status.label-info").text
  when "国家标准" then "national"
  when /^行业标准/ then "sector"
  end
end
162
+
163
# Looks up the entry for the identifier's prefix: the part of its first
# whitespace-delimited token that precedes any "/".
#
# @param ref [String] document identifier
# @return [Hash{String=>String}, nil] whatever #prefix returns for it
def get_prefix(ref)
  leading_token = ref[/^[^\s]+/].to_s
  prefix leading_token.split("/").first
end
169
+
170
# Returns the entry for +pref+ from yaml/prefixes.yaml, which is loaded on
# first use and kept in @prefixes afterwards.
#
# @param pref [String]
# @return [Hash{String=>String}]
def prefix(pref)
  @prefixes ||= YAML.load_file(File.join(__dir__, "yaml/prefixes.yaml"))
  @prefixes[pref]
end
177
+
178
# Derives the mandate from the non-whitespace text that follows the first
# "/" of the identifier: "T" is recommended, "Z" is guidelines, anything
# else (including no "/") is mandatory.
#
# @param ref [String] document identifier
# @return [String]
def get_mandate(ref)
  suffix = ref[%r{(?<=\/)[^\s]+}].to_s
  { "T" => "recommended", "Z" => "guidelines" }.fetch(suffix, "mandatory")
end
187
+
188
# Reads the text after the "发布日期" dt as the publication date.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>]
#   * :type [String] type of date
#   * :on [String] date
def get_dates(doc)
  published_on = doc.xpath('//dt[.="发布日期"]/following-sibling::dd[1]').text
  [{ type: "published", on: published_on }]
end
196
+ end
197
+ end
@@ -0,0 +1,57 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
+ require "net/http"
5
+ require "json"
6
+ require "nokogiri"
7
+ require "relaton_gb/scrapper"
8
+ require "relaton_gb/gb_bibliographic_item"
9
+ require "relaton_gb/hit_collection"
10
+ require "relaton_gb/hit"
11
+
12
+ module RelatonGb
13
+ # Sector standard scrapper
14
+ module SecScrapper
15
+ extend Scrapper
16
+
17
+ class << self
18
# Searches the sector-standard site for +text+.
#
# The search text is form-encoded before it is placed in the query string,
# so codes containing spaces or non-ASCII characters produce a valid URI.
#
# @param text [String] code of standard for search
# @return [RelatonGb::HitCollection, nil] nil (after a warning) when one of
#   the rescued network errors occurs
def scrape_page(text)
  query = URI.encode_www_form_component(text)
  uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{query}"
  res = JSON.parse Net::HTTP.get(uri)
  hits = res["rows"].map do |r|
    Hit.new pid: r["id"], title: r["STD_CODE"], scrapper: self
  end
  HitCollection.new hits
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
       Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
  warn "Cannot access #{uri}"
end
31
+
32
# Downloads a sector standard's detail page and builds a bibliographic item
# from it.
#
# @param pid [String] standard's page id
# @return [RelatonGb::GbBibliographicItem, nil] nil (after a warning) when
#   one of the rescued network errors occurs
def scrape_doc(pid)
  src = "http://www.std.gov.cn/hb/search/stdHBDetailed?id=#{pid}"
  html = Net::HTTP.get(URI(src))
  GbBibliographicItem.new scrapped_data(Nokogiri::HTML(html), src: src)
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
       Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
  warn "Cannot access #{src}"
end
43
+
44
+ private
45
+
46
# Names the committee after the "administration" recorded for the
# identifier's prefix.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Hash]
#   * :type [String]
#   * :name [String]
def get_committee(doc)
  administration = get_prefix(get_ref(doc))["administration"]
  { type: "technical", name: administration }
end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,121 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
+ require "open-uri"
5
+ require "nokogiri"
6
+ require "relaton_gb/scrapper"
7
+ require "relaton_gb/gb_bibliographic_item"
8
+ require "relaton_gb/hit_collection"
9
+ require "relaton_gb/hit"
10
+
11
+ module RelatonGb
12
+   # Social standard scrapper.
13
+ module TScrapper
14
+ extend Scrapper
15
+
16
+ class << self
17
# rubocop:disable Metrics/MethodLength, Metrics/AbcSize
# Searches the social-standard site for +text+ and collects one hit per
# link found in the result table.
#
# Hyphens in +text+ are sent as em dashes; in the scraped titles the byte
# sequence matched by the regexp below is turned back into "-".
#
# @param text [String]
# @return [RelatonGb::HitCollection, nil] nil (after a warning) on
#   OpenURI::HTTPError or SocketError
def scrape_page(text)
  em_dash = [8212].pack("U")
  query = CGI.escape(text.tr("-", em_dash))
  search_html = OpenURI.open_uri(
    "http://www.ttbz.org.cn/Home/Standard?searchType=2&key=" + query,
  )
  links = Nokogiri::HTML(search_html)
    .xpath('//table[contains(@class, "standard_list_table")]/tr/td/a')
  hits = links.map do |link|
    code_cell = link.at("../preceding-sibling::td[3]")
    Hit.new pid: link[:href].sub(%r{\/$}, ""),
            title: code_cell.text.gsub(/â\u0080\u0094/, "-"),
            scrapper: self
  end
  HitCollection.new hits
rescue OpenURI::HTTPError, SocketError
  warn "Cannot access http://www.ttbz.org.cn/Home/Standard"
end
# rubocop:enable Metrics/MethodLength, Metrics/AbcSize
37
+
38
# Downloads a social standard's page, parses it as UTF-8 and builds a
# bibliographic item from it.
#
# @param pid [String] standard's page path
# @return [RelatonGb::GbBibliographicItem, nil] nil (after a warning) on
#   OpenURI::HTTPError or SocketError
def scrape_doc(pid)
  src = "http://www.ttbz.org.cn#{pid}"
  page = OpenURI.open_uri(src)
  doc = Nokogiri::HTML(page, nil, Encoding::UTF_8.to_s)
  GbBibliographicItem.new scrapped_data(doc, src: src)
rescue OpenURI::HTTPError, SocketError
  warn "Cannot access #{src}"
end
47
+
48
+ private
49
+
50
# rubocop:disable Metrics/MethodLength
# Gathers every attribute scraped from a social standard's page.
#
# The identifier cell XPath is passed to both get_docid and
# fetch_structuredidentifier so that the two read the same cell.
#
# @param doc [Nokogiri::HTML::Document]
# @param src [String] url of scrapped page
# @return [Hash]
def scrapped_data(doc, src:)
  docid_xpt = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
  status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
  {
    committee: get_committee(doc),
    docid: get_docid(doc, docid_xpt),
    titles: get_titles(doc),
    type: "international-standard",
    docstatus: get_status(doc, status_xpt),
    gbtype: gbtype,
    ccs: get_ccs(doc),
    ics: get_ics(doc),
    link: [{ type: "src", content: src }],
    dates: get_dates(doc),
    language: ["zh"],
    script: ["Hans"],
    structuredidentifier: fetch_structuredidentifier(doc, docid_xpt),
  }
end
# rubocop:enable Metrics/MethodLength
73
+
74
# Reads the committee name from the cell after the "团体名称" cell.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Hash]
#   * :name [String]
#   * :type [String]
def get_committee(doc)
  group_name = doc.xpath('//td[.="团体名称"]/following-sibling::td[1]').text
  { name: group_name, type: "technical" }
end
80
+
81
# Reads the Chinese title (cell after "中文标题") and, when non-empty, the
# English title (cell after "英文标题").
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>]
#   * :title_main [String]
#   * :title_intro [nil]
#   * :language [String]
#   * :script [String]
def get_titles(doc)
  chinese_text = doc.xpath('//td[contains(.,"中文标题")]/following-sibling::td[1]').text
  english_text = doc.xpath('//td[contains(.,"英文标题")]/following-sibling::td[1]').text
  chinese = { title_main: chinese_text, title_intro: nil, language: "zh", script: "Hans" }
  return [chinese] if english_text.empty?

  [chinese, { title_main: english_text, title_intro: nil, language: "en", script: "Latn" }]
end
93
+
94
# Fixed GB type used for every social standard.
#
# @return [Hash]
#   * :scope [String]
#   * :prefix [String]
#   * :mandate [String]
def gbtype
  {
    scope: "social-group",
    prefix: "T",
    mandate: "mandatory",
  }
end
97
+
98
+ # def get_group_code(ref)
99
+ # ref.match(%r{(?<=\/)[^\s]})
100
+ # end
101
+
102
# Reads the CCS code: the first whitespace-delimited token of the cell after
# "中国标准分类号", once line breaks and surrounding whitespace are removed.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<String>] one element, "" when the cell is blank
def get_ccs(doc)
  cell = doc.xpath('//td[contains(.,"中国标准分类号")]/following-sibling::td[1]')
  code = cell.text.delete("\r\n").strip[/^[^\s]+/].to_s
  [code]
end
106
+
107
# Splits the ICS code (first whitespace-delimited token of the span after
# "国际标准分类号") into its parts.
#
# The group is right-padded with "0" to three characters. A page without a
# code yields no entry, and a code without a "." yields a nil group, instead
# of raising.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] empty when no ICS code is found
#   * :field [String]
#   * :group [String, nil]
#   * :subgroup [String, nil]
def get_ics(doc)
  xpath = '//td[contains(.,"国际标准分类号")]/following-sibling::td[1]/span'
  ics = doc.xpath(xpath).text.match(/^[^\s]+/).to_s
  field, group, subgroup = ics.split "."
  return [] unless field

  [{ field: field, group: group&.ljust(3, "0"), subgroup: subgroup }]
end
113
+
114
# Reads the publication date from the span after "发布日期" and reformats it
# as YYYY-MM-DD.
#
# The text must contain a four-digit year, a two-digit month and a two-digit
# day separated by non-digits; when it does not, no date is returned instead
# of raising on the failed match.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] empty when no date is found
#   * :type [String] type of date
#   * :on [String] date
def get_dates(doc)
  d = doc.xpath('//td[contains(.,"发布日期")]/following-sibling::td[1]/span')
    .text.match(/(?<y>\d{4})[^\d]+(?<m>\d{2})[^\d]+(?<d>\d{2})/)
  return [] unless d

  [{ type: "published", on: "#{d[:y]}-#{d[:m]}-#{d[:d]}" }]
end
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RelatonGb
4
+ VERSION = "0.1.0"
5
+ end
@@ -0,0 +1,49 @@
1
+ require "nokogiri"
2
+
3
+ module RelatonGb
4
+ class XMLParser < RelatonIsoBib::XMLParser
5
+ class << self
6
# Parses +xml+ and builds a bibliographic item from its root bibitem or
# bibdata element.
#
# @param xml [String]
# @return [RelatonGb::GbBibliographicItem]
def from_xml(xml)
  root = Nokogiri::XML(xml).at("/bibitem|/bibdata")
  GbBibliographicItem.new item_data(root)
end
11
+
12
+ private
13
+
14
# Extends the hash produced by the parent parser with the GB-specific
# fields read from +gbitem+.
#
# @param gbitem [Nokogiri::XML::Element]
# @return [Hash] the parent's hash with :committee, :gbtype, :ccs and
#   :plannumber added
def item_data(gbitem)
  super.update(
    committee: fetch_committee(gbitem),
    gbtype: fetch_gbtype(gbitem),
    ccs: fetch_ccs(gbitem),
    plannumber: gbitem.at("./plannumber")&.text,
  )
end
22
+
23
+       # Override get_id from RelatonIsoBib::XMLParser
24
+ # def get_id(did)
25
+ # did.text.match(/^(?<project>.*?\d+)(?<hyphen>-)?(?(<hyphen>)(?<year>\d*))/)
26
+ # end
27
+
28
# Reads the committee from the ext/gbcommittee element.
#
# @param doc [Nokogiri::XML::Element]
# @return [Hash, nil] nil when the element is absent
#   * :type [String] value of the element's type attribute
#   * :name [String] text of the element
def fetch_committee(doc)
  node = doc.at("./ext/gbcommittee")
  node && { type: node[:type], name: node.text }
end
34
+
35
# Collects the text of every ext/ccs/code element.
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<String>]
def fetch_ccs(doc)
  doc.xpath("./ext/ccs/code").map { |code| code.text }
end
38
+
39
+ def fetch_gbtype(doc)
40
+ gbtype = doc.at "./ext/gbtype"
41
+ {
42
+ scope: gbtype&.at("gbscope")&.text,
43
+ prefix: gbtype&.at("gbprefix")&.text,
44
+ mandate: gbtype&.at("gbmandate")&.text,
45
+ }
46
+ end
47
+ end
48
+ end
49
+ end