relaton-gb 0.6.2 → 0.6.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +18 -7
- data/lib/relaton_gb/gb_bibliography.rb +9 -11
- data/lib/relaton_gb/gb_scrapper.rb +18 -13
- data/lib/relaton_gb/hit.rb +19 -25
- data/lib/relaton_gb/processor.rb +35 -0
- data/lib/relaton_gb/scrapper.rb +55 -66
- data/lib/relaton_gb/sec_scrapper.rb +47 -12
- data/lib/relaton_gb/t_scrapper.rb +22 -17
- data/lib/relaton_gb/version.rb +1 -1
- data/lib/relaton_gb.rb +7 -4
- data/relaton_gb.gemspec +2 -0
- metadata +31 -3
- data/lib/relaton/processor.rb +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 64d418169f8f5f6c500a43e913ea94d13d4dca7d
|
4
|
+
data.tar.gz: 2717d29fdc48172660535462cade3eb856dcf362
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e372dd88445fdb97b2bda4c41b755bb8b325ade7c271f0db0e567c697c7b4c80b6d1da24c7af1e8439fee87e9e9cf1b2ff2940e1dd86d5146c1efa73ec048e4b
|
7
|
+
data.tar.gz: ee2e5d7edc8f836505a024bff24e543fa70a05d9508885a3ead36b798a0750aa661a825f3f33250b00740a78b5edfb02d5610d616902aa7899d3ea75046e7594
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
relaton-gb (0.6.
|
4
|
+
relaton-gb (0.6.3)
|
5
5
|
cnccs (~> 0.1.1)
|
6
6
|
gb-agencies (~> 0.0.1)
|
7
7
|
relaton-iso-bib (~> 0.3.0)
|
@@ -9,11 +9,13 @@ PATH
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
11
11
|
specs:
|
12
|
-
addressable (2.
|
13
|
-
public_suffix (>= 2.0.2, <
|
12
|
+
addressable (2.7.0)
|
13
|
+
public_suffix (>= 2.0.2, < 5.0)
|
14
14
|
byebug (11.0.1)
|
15
15
|
cnccs (0.1.3)
|
16
16
|
coderay (1.1.2)
|
17
|
+
crack (0.4.3)
|
18
|
+
safe_yaml (~> 1.0.0)
|
17
19
|
debase (0.2.2)
|
18
20
|
debase-ruby_core_source (>= 0.10.2)
|
19
21
|
debase-ruby_core_source (0.10.5)
|
@@ -22,6 +24,7 @@ GEM
|
|
22
24
|
equivalent-xml (0.6.0)
|
23
25
|
nokogiri (>= 1.4.3)
|
24
26
|
gb-agencies (0.0.5)
|
27
|
+
hashdiff (1.0.0)
|
25
28
|
isoics (0.1.7)
|
26
29
|
json (2.2.0)
|
27
30
|
method_source (0.9.2)
|
@@ -34,12 +37,12 @@ GEM
|
|
34
37
|
pry-byebug (3.7.0)
|
35
38
|
byebug (~> 11.0)
|
36
39
|
pry (~> 0.10)
|
37
|
-
public_suffix (
|
40
|
+
public_suffix (4.0.1)
|
38
41
|
rake (10.5.0)
|
39
|
-
relaton-bib (0.3.
|
42
|
+
relaton-bib (0.3.6)
|
40
43
|
addressable
|
41
|
-
nokogiri
|
42
|
-
relaton-iso-bib (0.3.
|
44
|
+
nokogiri
|
45
|
+
relaton-iso-bib (0.3.5)
|
43
46
|
isoics (~> 0.1.6)
|
44
47
|
relaton-bib (~> 0.3.0)
|
45
48
|
ruby_deep_clone (~> 0.8.0)
|
@@ -59,11 +62,17 @@ GEM
|
|
59
62
|
ruby-debug-ide (0.7.0)
|
60
63
|
rake (>= 0.8.1)
|
61
64
|
ruby_deep_clone (0.8.0)
|
65
|
+
safe_yaml (1.0.5)
|
62
66
|
simplecov (0.16.1)
|
63
67
|
docile (~> 1.1)
|
64
68
|
json (>= 1.8, < 3)
|
65
69
|
simplecov-html (~> 0.10.0)
|
66
70
|
simplecov-html (0.10.2)
|
71
|
+
vcr (5.0.0)
|
72
|
+
webmock (3.7.0)
|
73
|
+
addressable (>= 2.3.6)
|
74
|
+
crack (>= 0.3.2)
|
75
|
+
hashdiff (>= 0.4.0, < 2.0.0)
|
67
76
|
|
68
77
|
PLATFORMS
|
69
78
|
ruby
|
@@ -78,6 +87,8 @@ DEPENDENCIES
|
|
78
87
|
rspec (~> 3.0)
|
79
88
|
ruby-debug-ide
|
80
89
|
simplecov
|
90
|
+
vcr
|
91
|
+
webmock
|
81
92
|
|
82
93
|
BUNDLED WITH
|
83
94
|
2.0.1
|
@@ -91,16 +91,11 @@ module RelatonGb
|
|
91
91
|
def search_filter(code)
|
92
92
|
# search filter needs to incorporate year
|
93
93
|
docidrx = %r{^[^\s]+\s[\d\.-]+}
|
94
|
-
# corrigrx = %r{^[^\s]+\s[\d\.]+-[0-9]+/}
|
95
94
|
warn "fetching #{code}..."
|
96
95
|
result = search(code)
|
97
|
-
|
98
|
-
hit.
|
99
|
-
# !corrigrx =~ hit.title
|
96
|
+
result.select do |hit|
|
97
|
+
hit.docref && hit.docref.match(docidrx).to_s.include?(code)
|
100
98
|
end
|
101
|
-
return ret unless ret.empty?
|
102
|
-
|
103
|
-
[]
|
104
99
|
end
|
105
100
|
|
106
101
|
# Sort through the results from Isobib, fetching them three at a time,
|
@@ -125,12 +120,15 @@ module RelatonGb
|
|
125
120
|
{ years: missed_years }
|
126
121
|
end
|
127
122
|
|
128
|
-
|
129
|
-
|
123
|
+
# @param hits [RelatonBib::HitCollection<RelatonBib::Hit>]
|
124
|
+
# @param threads [Integer]
|
125
|
+
# @return [Array<RelatonGb::GbBibliographicItem>]
|
126
|
+
def fetch_pages(hits, threads)
|
127
|
+
workers = RelatonBib::WorkersPool.new threads
|
130
128
|
workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
|
131
|
-
|
129
|
+
hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
|
132
130
|
workers.end
|
133
|
-
workers.result.
|
131
|
+
workers.result.sort_by { |x| x[:i] }.map { |x| x[:hit] }
|
134
132
|
end
|
135
133
|
end
|
136
134
|
end
|
@@ -16,35 +16,40 @@ module RelatonGb
|
|
16
16
|
# @return [RelatonGb::HitCollection]
|
17
17
|
def scrape_page(text)
|
18
18
|
search_html = OpenURI.open_uri(
|
19
|
-
"http://
|
19
|
+
"http://openstd.samr.gov.cn/bzgk/gb/std_list?p.p2=" + text
|
20
20
|
)
|
21
21
|
result = Nokogiri::HTML search_html
|
22
|
-
hits = result.
|
23
|
-
|
22
|
+
hits = result.xpath(
|
23
|
+
"//table[contains(@class, 'result_list')]/tbody[2]/tr",
|
24
|
+
).map do |h|
|
25
|
+
ref = h.at "./td[2]/a"
|
26
|
+
pid = ref[:onclick].match(/[0-9A-F]+/).to_s
|
27
|
+
rdate = h.at("./td[7]").text
|
28
|
+
Hit.new pid: pid, docref: ref.text, scrapper: self, release_date: rdate
|
24
29
|
end
|
25
|
-
HitCollection.new hits
|
30
|
+
HitCollection.new hits.sort_by(&:release_date).reverse
|
26
31
|
rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
|
27
|
-
raise RelatonBib::RequestError, "Cannot access http://www.std.gov.cn/
|
32
|
+
raise RelatonBib::RequestError, "Cannot access http://www.std.gov.cn/bzgk/gb/std_list"
|
28
33
|
end
|
29
34
|
|
30
|
-
# @param
|
35
|
+
# @param hit [RelatonGb::Hit] standard's page id
|
31
36
|
# @return [RelatonGb::GbBibliographicItem]
|
32
|
-
def scrape_doc(
|
33
|
-
src = "http://
|
37
|
+
def scrape_doc(hit)
|
38
|
+
src = "http://openstd.samr.gov.cn/bzgk/gb/newGbInfo?hcno=" + hit.pid
|
34
39
|
doc = Nokogiri::HTML OpenURI.open_uri(src)
|
35
|
-
GbBibliographicItem.new scrapped_data(doc, src
|
40
|
+
GbBibliographicItem.new scrapped_data(doc, src, hit)
|
36
41
|
rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
|
37
42
|
raise RelatonBib::RequestError, "Cannot access #{src}"
|
38
43
|
end
|
39
44
|
|
40
45
|
# @param doc [Nokogiri::HTML]
|
46
|
+
# @param _ref [String]
|
41
47
|
# @return [Hash]
|
42
48
|
# * :type [String]
|
43
49
|
# * :name [String]
|
44
|
-
def get_committee(doc)
|
45
|
-
name = doc.
|
46
|
-
|
47
|
-
{ type: "technical", name: name }
|
50
|
+
def get_committee(doc, _ref)
|
51
|
+
name = doc.at("//div[contains(text(), '归口单位')]/following-sibling::div")
|
52
|
+
{ type: "technical", name: name.text.delete("\r\n\t\t") }
|
48
53
|
end
|
49
54
|
end
|
50
55
|
end
|
data/lib/relaton_gb/hit.rb
CHANGED
@@ -7,28 +7,36 @@ module RelatonGb
|
|
7
7
|
attr_reader :hit_collection
|
8
8
|
|
9
9
|
# @return [String]
|
10
|
-
attr_reader :pid
|
10
|
+
attr_reader :pid, :docref
|
11
11
|
|
12
|
-
# @return [
|
13
|
-
attr_reader :
|
12
|
+
# @return [Date, NilClass]
|
13
|
+
attr_reader :release_date
|
14
|
+
|
15
|
+
# @return [String, NilClass]
|
16
|
+
attr_reader :status
|
14
17
|
|
15
18
|
# @return [RelatonGb::GbScrapper, RelatonGb::SecScraper, RelatonGb::TScrapper]
|
16
19
|
attr_reader :scrapper
|
17
20
|
|
18
|
-
# @param
|
19
|
-
# @param
|
20
|
-
|
21
|
+
# @param pid [String]
|
22
|
+
# @param docref [String]
|
23
|
+
# @param scrapper [RelatonGb::GbScrapper, RelatonGb::SecScraper, RelatonGb::TScrapper]
|
24
|
+
# @param release_date [String]
|
25
|
+
# @param status [String, NilClass]
|
26
|
+
# @param hit_collection [RelatonGb::HitCollection, NilClass]
|
27
|
+
def initialize(pid:, docref:, scrapper:, **args)
|
21
28
|
@pid = pid
|
22
|
-
@
|
23
|
-
@hit_collection = hit_collection
|
29
|
+
@docref = docref
|
24
30
|
@scrapper = scrapper
|
25
|
-
|
31
|
+
@release_date = Date.parse args[:release_date] if args[:release_date]
|
32
|
+
@status = args[:status]
|
33
|
+
@hit_collection = args[:hit_collection]
|
26
34
|
end
|
27
35
|
|
28
36
|
# Parse page.
|
29
37
|
# @return [RelatonGb::GbBibliographicItem]
|
30
38
|
def fetch
|
31
|
-
@fetch ||= scrapper.scrape_doc
|
39
|
+
@fetch ||= scrapper.scrape_doc self
|
32
40
|
end
|
33
41
|
|
34
42
|
# @return [String]
|
@@ -40,21 +48,7 @@ module RelatonGb
|
|
40
48
|
def inspect
|
41
49
|
"<#{self.class}:#{format('%#.14x', object_id << 1)} "\
|
42
50
|
"@fullIdentifier=\"#{@fetch&.shortref}\" "\
|
43
|
-
"@
|
51
|
+
"@docref=\"#{docref}\">"
|
44
52
|
end
|
45
|
-
|
46
|
-
# @param builder [Nokogiri::XML::Builder]
|
47
|
-
# @param opts [Hash]
|
48
|
-
# @return [String]
|
49
|
-
# def to_xml(builder = nil, opts = {})
|
50
|
-
# if builder
|
51
|
-
# fetch.to_xml builder, opts
|
52
|
-
# else
|
53
|
-
# builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
|
54
|
-
# fetch.to_xml xml, opts
|
55
|
-
# end
|
56
|
-
# builder.doc.root.to_xml
|
57
|
-
# end
|
58
|
-
# end
|
59
53
|
end
|
60
54
|
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "relaton/processor"
|
4
|
+
|
5
|
+
module RelatonGb
|
6
|
+
class Processor < Relaton::Processor
|
7
|
+
def initialize
|
8
|
+
@short = :relaton_gb
|
9
|
+
@prefix = "CN"
|
10
|
+
@defaultprefix = %r{^GB }
|
11
|
+
@idtype = "Chinese Standard"
|
12
|
+
end
|
13
|
+
|
14
|
+
# @param code [String]
|
15
|
+
# @param date [String, NilClass] year
|
16
|
+
# @param opts [Hash]
|
17
|
+
# @return [RelatonGb::GbBibliographicItem]
|
18
|
+
def get(code, date, opts)
|
19
|
+
::RelatonGb::GbBibliography.get(code, date, opts)
|
20
|
+
end
|
21
|
+
|
22
|
+
# @param xml [String]
|
23
|
+
# @return [RelatonGb::GbBibliographicItem]
|
24
|
+
def from_xml(xml)
|
25
|
+
::RelatonGb::XMLParser.from_xml xml
|
26
|
+
end
|
27
|
+
|
28
|
+
# @param hash [Hash]
|
29
|
+
# @return [RelatonGb::GbBibliographicItem]
|
30
|
+
def hash_to_bib(hash)
|
31
|
+
item_hash = ::RelatonGb::HashConverter.hash_to_bib(hash)
|
32
|
+
::RelatonGb::GbBibliographicItem.new item_hash
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
data/lib/relaton_gb/scrapper.rb
CHANGED
@@ -11,64 +11,54 @@ module RelatonGb
|
|
11
11
|
|
12
12
|
# rubocop:disable Metrics/MethodLength
|
13
13
|
# @param doc [Nokogiri::HTML::Document]
|
14
|
-
# @param src [String]
|
14
|
+
# @param src [String]
|
15
|
+
# @param hit [RelatonGb::Hit]
|
15
16
|
# @return [Hash]
|
16
|
-
def scrapped_data(doc, src
|
17
|
+
def scrapped_data(doc, src, hit)
|
17
18
|
{
|
18
|
-
committee: get_committee(doc),
|
19
|
-
docid: get_docid(
|
19
|
+
committee: get_committee(doc, hit.docref),
|
20
|
+
docid: get_docid(hit.docref),
|
20
21
|
title: get_titles(doc),
|
21
|
-
contributor: get_contributors(doc),
|
22
|
-
type: get_type
|
23
|
-
docstatus: get_status(doc),
|
24
|
-
gbtype: get_gbtype(doc),
|
22
|
+
contributor: get_contributors(doc, hit.docref),
|
23
|
+
type: get_type,
|
24
|
+
docstatus: get_status(doc, hit.status),
|
25
|
+
gbtype: get_gbtype(doc, hit.docref),
|
25
26
|
ccs: get_ccs(doc),
|
26
27
|
ics: get_ics(doc),
|
27
28
|
link: [{ type: "src", content: src }],
|
28
29
|
date: get_dates(doc),
|
29
30
|
language: ["zh"],
|
30
31
|
script: ["Hans"],
|
31
|
-
structuredidentifier: fetch_structuredidentifier(
|
32
|
+
structuredidentifier: fetch_structuredidentifier(hit.docref),
|
32
33
|
}
|
33
34
|
end
|
34
35
|
# rubocop:enable Metrics/MethodLength
|
35
36
|
|
36
|
-
# @param
|
37
|
-
# @param xpt [String]
|
37
|
+
# @param docref [String]
|
38
38
|
# @return [Array<RelatonBib::DocumentIdentifier>]
|
39
|
-
def get_docid(
|
40
|
-
|
41
|
-
return [] unless item_ref
|
42
|
-
|
43
|
-
[RelatonBib::DocumentIdentifier.new(id: item_ref.text, type: "Chinese Standard")]
|
39
|
+
def get_docid(docref)
|
40
|
+
[RelatonBib::DocumentIdentifier.new(id: docref, type: "Chinese Standard")]
|
44
41
|
end
|
45
42
|
|
46
|
-
# @param
|
47
|
-
# @param xpt [String]
|
43
|
+
# @param docref [String]
|
48
44
|
# @return [RelatonIsoBib::StructuredIdentifier]
|
49
|
-
def fetch_structuredidentifier(
|
50
|
-
|
51
|
-
unless item_ref
|
52
|
-
return RelatonIsoBib::StructuredIdentifier.new(
|
53
|
-
project_number: "?", part_number: "?", prefix: nil, id: "?",
|
54
|
-
type: "Chinese Standard"
|
55
|
-
)
|
56
|
-
end
|
57
|
-
|
58
|
-
m = item_ref.text.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
|
59
|
-
# prefix = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
|
45
|
+
def fetch_structuredidentifier(docref)
|
46
|
+
m = docref.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
|
60
47
|
RelatonIsoBib::StructuredIdentifier.new(
|
61
48
|
project_number: m[1], part_number: m[2], prefix: nil,
|
62
|
-
id:
|
49
|
+
id: docref, type: "Chinese Standard"
|
63
50
|
)
|
64
51
|
end
|
65
52
|
|
66
|
-
|
53
|
+
# @param doc [Nokogiri::HTML::Document]
|
54
|
+
# @param docref [String]
|
55
|
+
# @return [Array<Hash>]
|
56
|
+
def get_contributors(doc, docref)
|
67
57
|
gb_en = GbAgencies::Agencies.new("en", {}, "")
|
68
58
|
gb_zh = GbAgencies::Agencies.new("zh", {}, "")
|
69
|
-
name =
|
59
|
+
name = docref.match(/^[^\s]+/).to_s
|
70
60
|
name.sub!(%r{/[TZ]$}, "") unless name =~ /^GB/
|
71
|
-
gbtype = get_gbtype(doc)
|
61
|
+
gbtype = get_gbtype(doc, docref)
|
72
62
|
entity = RelatonBib::Organization.new name: [
|
73
63
|
{ language: "en", content: gb_en.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
|
74
64
|
{ language: "zh", content: gb_zh.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
|
@@ -83,57 +73,56 @@ module RelatonGb
|
|
83
73
|
# * :language [String]
|
84
74
|
# * :script [String]
|
85
75
|
def get_titles(doc)
|
86
|
-
titles = [{ title_main: doc.
|
87
|
-
language: "zh", script: "Hans" }]
|
88
|
-
title_main = doc.
|
76
|
+
titles = [{ title_main: doc.at("//td[contains(text(), '中文标准名称')]/b").text,
|
77
|
+
title_intro: nil, language: "zh", script: "Hans" }]
|
78
|
+
title_main = doc.at("//td[contains(text(), '英文标准名称')]").text.match(/[\w\s]+/).to_s
|
89
79
|
unless title_main.empty?
|
90
80
|
titles << { title_main: title_main, title_intro: nil, language: "en", script: "Latn" }
|
91
81
|
end
|
92
82
|
titles
|
93
83
|
end
|
94
84
|
|
95
|
-
def get_type
|
96
|
-
"
|
85
|
+
def get_type
|
86
|
+
"standard"
|
97
87
|
end
|
98
88
|
|
99
89
|
# @param doc [Nokogiri::HTML::Document]
|
100
|
-
# @param
|
90
|
+
# @param status [String, NilClass]
|
101
91
|
# @return [RelatonBib::DocumentStatus]
|
102
|
-
def get_status(doc,
|
103
|
-
case doc.at(
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
when "废止"
|
109
|
-
stage = "obsoleted"
|
110
|
-
end
|
92
|
+
def get_status(doc, status = nil)
|
93
|
+
stage = case status || doc.at("//td[contains(., '标准状态')]/span")&.text
|
94
|
+
when "即将实施" then "published"
|
95
|
+
when "现行" then "activated"
|
96
|
+
when "废止" then "obsoleted"
|
97
|
+
end
|
111
98
|
RelatonBib::DocumentStatus.new stage: stage
|
112
99
|
end
|
113
100
|
|
114
101
|
private
|
115
102
|
|
116
103
|
# @param doc [Nokogiri::HTML::Document]
|
104
|
+
# @param ref [String]
|
117
105
|
# @return [Hash]
|
118
106
|
# * :scope [String]
|
119
107
|
# * :prefix [String]
|
120
108
|
# * :mandate [String]
|
121
|
-
def get_gbtype(doc)
|
122
|
-
ref = get_ref(doc)
|
109
|
+
def get_gbtype(doc, ref)
|
110
|
+
# ref = get_ref(doc)
|
123
111
|
{ scope: get_scope(doc), prefix: get_prefix(ref)["prefix"],
|
124
112
|
mandate: get_mandate(ref) }
|
125
113
|
end
|
126
114
|
|
127
115
|
# @param doc [Nokogiri::HTML::Document]
|
128
116
|
# @return [String]
|
129
|
-
def get_ref(doc)
|
130
|
-
|
131
|
-
end
|
117
|
+
# def get_ref(doc)
|
118
|
+
# doc.xpath('//dt[text()="标准号"]/following-sibling::dd[1]').text
|
119
|
+
# end
|
132
120
|
|
133
121
|
# @param doc [Nokogiri::HTML::Document]
|
134
122
|
# @return [Array<String>]
|
135
123
|
def get_ccs(doc)
|
136
|
-
[doc
|
124
|
+
[doc.at("//div[contains(text(), '中国标准分类号')]/following-sibling::div").
|
125
|
+
text.delete("\r\n\t\t")]
|
137
126
|
end
|
138
127
|
|
139
128
|
# @param doc [Nokogiri::HTML::Document]
|
@@ -142,21 +131,21 @@ module RelatonGb
|
|
142
131
|
# * :group [String]
|
143
132
|
# * :subgroup [String]
|
144
133
|
def get_ics(doc)
|
145
|
-
ics = doc.
|
146
|
-
|
134
|
+
ics = doc.at("//div[contains(text(), '国际标准分类号')]/following-sibling::div"\
|
135
|
+
" | //dt[contains(text(), '国际标准分类号')]/following-sibling::dd")
|
136
|
+
return [] unless ics
|
147
137
|
|
148
|
-
field, group, subgroup = ics.text.split "."
|
138
|
+
field, group, subgroup = ics.text.delete("\r\n\t\t").split "."
|
149
139
|
[{ field: field, group: group.ljust(3, "0"), subgroup: subgroup }]
|
150
140
|
end
|
151
141
|
|
152
142
|
# @param doc [Nokogiri::HTML::Document]
|
153
143
|
# @return [String]
|
154
144
|
def get_scope(doc)
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
"sector"
|
145
|
+
issued = doc.at("//div[contains(., '发布单位')]/following-sibling::div")
|
146
|
+
case issued&.text
|
147
|
+
when /国家标准/ then "national"
|
148
|
+
when /^行业标准/ then "sector"
|
160
149
|
end
|
161
150
|
end
|
162
151
|
|
@@ -170,8 +159,7 @@ module RelatonGb
|
|
170
159
|
# @param pref [String]
|
171
160
|
# @return [Hash{String=>String}]
|
172
161
|
def prefix(pref)
|
173
|
-
|
174
|
-
@prefixes ||= YAML.load_file(file_path)
|
162
|
+
@prefixes ||= YAML.load_file File.join(__dir__, "yaml/prefixes.yaml")
|
175
163
|
@prefixes[pref]
|
176
164
|
end
|
177
165
|
|
@@ -190,8 +178,9 @@ module RelatonGb
|
|
190
178
|
# * :type [String] type of date
|
191
179
|
# * :on [String] date
|
192
180
|
def get_dates(doc)
|
193
|
-
date = doc.
|
194
|
-
|
181
|
+
date = doc.at("//div[contains(text(), '发布日期')]/following-sibling::div"\
|
182
|
+
" | //dt[contains(text(), '发布日期')]/following-sibling::dd")
|
183
|
+
[{ type: "published", on: date.text.delete("\r\n\t\t") }]
|
195
184
|
end
|
196
185
|
end
|
197
186
|
end
|
@@ -18,42 +18,77 @@ module RelatonGb
|
|
18
18
|
# @param text [String] code of standard for search
|
19
19
|
# @return [RelatonGb::HitCollection]
|
20
20
|
def scrape_page(text)
|
21
|
-
uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{text}"
|
22
|
-
|
23
|
-
|
24
|
-
|
21
|
+
# uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{text}"
|
22
|
+
uri = URI "http://hbba.sacinfo.org.cn/stdQueryList"
|
23
|
+
resp = Net::HTTP.post uri, URI.encode_www_form({ key: text })
|
24
|
+
# res = JSON.parse Net::HTTP.get(uri)
|
25
|
+
json = JSON.parse resp.body
|
26
|
+
hits = json["records"].map do |h|
|
27
|
+
Hit.new pid: h["pk"], docref: h["code"], status: h["status"], scrapper: self
|
25
28
|
end
|
29
|
+
# hits = res["rows"].map do |r|
|
30
|
+
# Hit.new pid: r["id"], title: r["STD_CODE"], scrapper: self
|
31
|
+
# end
|
26
32
|
HitCollection.new hits
|
27
33
|
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
|
28
34
|
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
|
29
|
-
OpenSSL::SSL::SSLError
|
35
|
+
OpenSSL::SSL::SSLError, Errno::ETIMEDOUT
|
30
36
|
raise RelatonBib::RequestError, "Cannot access #{uri}"
|
31
37
|
end
|
32
38
|
|
33
|
-
# @param
|
39
|
+
# @param hit [RelatonGb::Hit]
|
34
40
|
# @return [RelatonGb::GbBibliographicItem]
|
35
|
-
def scrape_doc(
|
36
|
-
src = "http://
|
41
|
+
def scrape_doc(hit)
|
42
|
+
src = "http://hbba.sacinfo.org.cn/stdDetail/#{hit.pid}"
|
37
43
|
page_uri = URI src
|
38
44
|
doc = Nokogiri::HTML Net::HTTP.get(page_uri)
|
39
|
-
GbBibliographicItem.new scrapped_data(doc, src
|
45
|
+
GbBibliographicItem.new scrapped_data(doc, src, hit)
|
40
46
|
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
|
41
47
|
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
|
42
|
-
OpenSSL::SSL::SSLError
|
48
|
+
OpenSSL::SSL::SSLError, Errno::ETIMEDOUT
|
43
49
|
raise RelatonBib::RequestError, "Cannot access #{src}"
|
44
50
|
end
|
45
51
|
|
46
52
|
private
|
47
53
|
|
48
54
|
# @param doc [Nokogiri::HTML::Document]
|
55
|
+
# @return [Array<Hash>]
|
56
|
+
# * :title_intro [String]
|
57
|
+
# * :title_main [String]
|
58
|
+
# * :language [String]
|
59
|
+
# * :script [String]
|
60
|
+
def get_titles(doc)
|
61
|
+
titles = [{ title_main: doc.at("//h4").text.delete("\r\n\t"),
|
62
|
+
title_intro: nil, language: "zh", script: "Hans" }]
|
63
|
+
# title_main = doc.at("//td[contains(text(), '英文标准名称')]").text.match(/[\w\s]+/).to_s
|
64
|
+
# unless title_main.empty?
|
65
|
+
# titles << { title_main: title_main, title_intro: nil, language: "en", script: "Latn" }
|
66
|
+
# end
|
67
|
+
titles
|
68
|
+
end
|
69
|
+
|
70
|
+
# @param _doc [Nokogiri::HTML::Document]
|
71
|
+
# @param ref [String]
|
49
72
|
# @return [Hash]
|
50
73
|
# * :type [String]
|
51
74
|
# * :name [String]
|
52
|
-
def get_committee(
|
53
|
-
ref = get_ref(doc)
|
75
|
+
def get_committee(_doc, ref)
|
76
|
+
# ref = get_ref(doc)
|
54
77
|
name = get_prefix(ref)["administration"]
|
55
78
|
{ type: "technical", name: name }
|
56
79
|
end
|
80
|
+
|
81
|
+
# @param _doc [Nokogiri::HTML::Document]
|
82
|
+
# @return [String]
|
83
|
+
def get_scope(_doc)
|
84
|
+
"sector"
|
85
|
+
end
|
86
|
+
|
87
|
+
# @param doc [Nokogiri::HTML::Document]
|
88
|
+
# @return [Array<String>]
|
89
|
+
def get_ccs(doc)
|
90
|
+
[doc.at("//dt[contains(text(), '中国标准分类号')]/following-sibling::dd").text]
|
91
|
+
end
|
57
92
|
end
|
58
93
|
end
|
59
94
|
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
# frozen_string_literal: true
|
3
3
|
|
4
4
|
require "open-uri"
|
5
|
+
require "net/http"
|
5
6
|
require "nokogiri"
|
6
7
|
require "relaton_gb/scrapper"
|
7
8
|
require "relaton_gb/gb_bibliographic_item"
|
@@ -21,13 +22,15 @@ module RelatonGb
|
|
21
22
|
search_html = OpenURI.open_uri(
|
22
23
|
"http://www.ttbz.org.cn/Home/Standard?searchType=2&key=" +
|
23
24
|
CGI.escape(text.tr("-", [8212].pack("U"))),
|
24
|
-
)
|
25
|
+
).read
|
25
26
|
header = Nokogiri::HTML search_html
|
26
27
|
xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
|
27
|
-
t_xpath = "../preceding-sibling::td[
|
28
|
+
t_xpath = "../preceding-sibling::td[4]"
|
28
29
|
hits = header.xpath(xpath).map do |h|
|
29
|
-
|
30
|
-
|
30
|
+
docref = h.at(t_xpath).text.gsub(/â\u0080\u0094/, "-")
|
31
|
+
status = h.at("../preceding-sibling::td[1]").text.delete "\r\n"
|
32
|
+
pid = h[:href].sub(%r{\/$}, "")
|
33
|
+
Hit.new pid: pid, docref: docref, status: status, scrapper: self
|
31
34
|
end
|
32
35
|
HitCollection.new hits
|
33
36
|
rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
|
@@ -35,12 +38,12 @@ module RelatonGb
|
|
35
38
|
end
|
36
39
|
# rubocop:enable Metrics/MethodLength, Metrics/AbcSize
|
37
40
|
|
38
|
-
# @param
|
41
|
+
# @param hit [RelatonGb::Hit] standard's page path
|
39
42
|
# @return [RelatonGb::GbBibliographicItem]
|
40
|
-
def scrape_doc(
|
41
|
-
src = "http://www.ttbz.org.cn#{pid}"
|
43
|
+
def scrape_doc(hit)
|
44
|
+
src = "http://www.ttbz.org.cn#{hit.pid}"
|
42
45
|
doc = Nokogiri::HTML OpenURI.open_uri(src), nil, Encoding::UTF_8.to_s
|
43
|
-
GbBibliographicItem.new scrapped_data(doc, src
|
46
|
+
GbBibliographicItem.new scrapped_data(doc, src, hit)
|
44
47
|
rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
|
45
48
|
raise RelatonBib::RequestError, "Cannot access #{src}"
|
46
49
|
end
|
@@ -49,16 +52,18 @@ module RelatonGb
|
|
49
52
|
|
50
53
|
# rubocop:disable Metrics/MethodLength
|
51
54
|
# @param doc [Nokogiri::HTML::Document]
|
55
|
+
# @param src [String]
|
56
|
+
# @param hit [RelatonGb::Hit]
|
52
57
|
# @return [Hash]
|
53
|
-
def scrapped_data(doc, src
|
54
|
-
docid_xpt = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
|
55
|
-
status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
|
58
|
+
def scrapped_data(doc, src, hit)
|
59
|
+
# docid_xpt = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
|
60
|
+
# status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
|
56
61
|
{
|
57
|
-
committee: get_committee(doc),
|
58
|
-
docid: get_docid(
|
62
|
+
committee: get_committee(doc, hit.docref),
|
63
|
+
docid: get_docid(hit.docref),
|
59
64
|
title: get_titles(doc),
|
60
|
-
type:
|
61
|
-
docstatus: get_status(doc,
|
65
|
+
type: get_type,
|
66
|
+
docstatus: get_status(doc, hit.status),
|
62
67
|
gbtype: gbtype,
|
63
68
|
ccs: get_ccs(doc),
|
64
69
|
ics: get_ics(doc),
|
@@ -66,12 +71,12 @@ module RelatonGb
|
|
66
71
|
date: get_dates(doc),
|
67
72
|
language: ["zh"],
|
68
73
|
script: ["Hans"],
|
69
|
-
structuredidentifier: fetch_structuredidentifier(
|
74
|
+
structuredidentifier: fetch_structuredidentifier(hit.docref),
|
70
75
|
}
|
71
76
|
end
|
72
77
|
# rubocop:enable Metrics/MethodLength
|
73
78
|
|
74
|
-
def get_committee(doc)
|
79
|
+
def get_committee(doc, _ref)
|
75
80
|
{
|
76
81
|
name: doc.xpath('//td[.="团体名称"]/following-sibling::td[1]').text,
|
77
82
|
type: "technical",
|
data/lib/relaton_gb/version.rb
CHANGED
data/lib/relaton_gb.rb
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
require "relaton_gb/version"
|
2
2
|
require "relaton_gb/gb_bibliography"
|
3
3
|
|
4
|
-
if defined? Relaton
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
# if defined? Relaton
|
5
|
+
# require "relaton_gb/processor"
|
6
|
+
# # don't register the gem if it's required from relaton's registry
|
7
|
+
# return if caller.detect { |c| c.include? "register_gems" }
|
8
|
+
|
9
|
+
# Relaton::Registry.instance.register RelatonGb::Processor
|
10
|
+
# end
|
data/relaton_gb.gemspec
CHANGED
@@ -32,6 +32,8 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_development_dependency "rspec", "~> 3.0"
|
33
33
|
spec.add_development_dependency "ruby-debug-ide"
|
34
34
|
spec.add_development_dependency "simplecov"
|
35
|
+
spec.add_development_dependency "vcr"
|
36
|
+
spec.add_development_dependency "webmock"
|
35
37
|
|
36
38
|
spec.add_dependency "cnccs", "~> 0.1.1"
|
37
39
|
spec.add_dependency "gb-agencies", "~> 0.0.1"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-gb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-09-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -122,6 +122,34 @@ dependencies:
|
|
122
122
|
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: vcr
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: webmock
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">="
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
125
153
|
- !ruby/object:Gem::Dependency
|
126
154
|
name: cnccs
|
127
155
|
requirement: !ruby/object:Gem::Requirement
|
@@ -185,7 +213,6 @@ files:
|
|
185
213
|
- appveyor.yml
|
186
214
|
- bin/console
|
187
215
|
- bin/setup
|
188
|
-
- lib/relaton/processor.rb
|
189
216
|
- lib/relaton_gb.rb
|
190
217
|
- lib/relaton_gb/ccs.rb
|
191
218
|
- lib/relaton_gb/gb_bibliographic_item.rb
|
@@ -196,6 +223,7 @@ files:
|
|
196
223
|
- lib/relaton_gb/hash_converter.rb
|
197
224
|
- lib/relaton_gb/hit.rb
|
198
225
|
- lib/relaton_gb/hit_collection.rb
|
226
|
+
- lib/relaton_gb/processor.rb
|
199
227
|
- lib/relaton_gb/scrapper.rb
|
200
228
|
- lib/relaton_gb/sec_scrapper.rb
|
201
229
|
- lib/relaton_gb/t_scrapper.rb
|
data/lib/relaton/processor.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require "relaton/processor"
|
4
|
-
|
5
|
-
module Relaton
|
6
|
-
module RelatonGb
|
7
|
-
class Processor < Relaton::Processor
|
8
|
-
def initialize
|
9
|
-
@short = :relaton_gb
|
10
|
-
@prefix = "CN"
|
11
|
-
@defaultprefix = %r{^GB }
|
12
|
-
@idtype = "Chinese Standard"
|
13
|
-
end
|
14
|
-
|
15
|
-
def get(code, date, opts)
|
16
|
-
::RelatonGb::GbBibliography.get(code, date, opts)
|
17
|
-
end
|
18
|
-
|
19
|
-
def from_xml(xml)
|
20
|
-
::RelatonGb::XMLParser.from_xml xml
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|