relaton-gb 0.6.2 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +18 -7
- data/lib/relaton_gb/gb_bibliography.rb +9 -11
- data/lib/relaton_gb/gb_scrapper.rb +18 -13
- data/lib/relaton_gb/hit.rb +19 -25
- data/lib/relaton_gb/processor.rb +35 -0
- data/lib/relaton_gb/scrapper.rb +55 -66
- data/lib/relaton_gb/sec_scrapper.rb +47 -12
- data/lib/relaton_gb/t_scrapper.rb +22 -17
- data/lib/relaton_gb/version.rb +1 -1
- data/lib/relaton_gb.rb +7 -4
- data/relaton_gb.gemspec +2 -0
- metadata +31 -3
- data/lib/relaton/processor.rb +0 -24
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 64d418169f8f5f6c500a43e913ea94d13d4dca7d
|
|
4
|
+
data.tar.gz: 2717d29fdc48172660535462cade3eb856dcf362
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e372dd88445fdb97b2bda4c41b755bb8b325ade7c271f0db0e567c697c7b4c80b6d1da24c7af1e8439fee87e9e9cf1b2ff2940e1dd86d5146c1efa73ec048e4b
|
|
7
|
+
data.tar.gz: ee2e5d7edc8f836505a024bff24e543fa70a05d9508885a3ead36b798a0750aa661a825f3f33250b00740a78b5edfb02d5610d616902aa7899d3ea75046e7594
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
relaton-gb (0.6.
|
|
4
|
+
relaton-gb (0.6.3)
|
|
5
5
|
cnccs (~> 0.1.1)
|
|
6
6
|
gb-agencies (~> 0.0.1)
|
|
7
7
|
relaton-iso-bib (~> 0.3.0)
|
|
@@ -9,11 +9,13 @@ PATH
|
|
|
9
9
|
GEM
|
|
10
10
|
remote: https://rubygems.org/
|
|
11
11
|
specs:
|
|
12
|
-
addressable (2.
|
|
13
|
-
public_suffix (>= 2.0.2, <
|
|
12
|
+
addressable (2.7.0)
|
|
13
|
+
public_suffix (>= 2.0.2, < 5.0)
|
|
14
14
|
byebug (11.0.1)
|
|
15
15
|
cnccs (0.1.3)
|
|
16
16
|
coderay (1.1.2)
|
|
17
|
+
crack (0.4.3)
|
|
18
|
+
safe_yaml (~> 1.0.0)
|
|
17
19
|
debase (0.2.2)
|
|
18
20
|
debase-ruby_core_source (>= 0.10.2)
|
|
19
21
|
debase-ruby_core_source (0.10.5)
|
|
@@ -22,6 +24,7 @@ GEM
|
|
|
22
24
|
equivalent-xml (0.6.0)
|
|
23
25
|
nokogiri (>= 1.4.3)
|
|
24
26
|
gb-agencies (0.0.5)
|
|
27
|
+
hashdiff (1.0.0)
|
|
25
28
|
isoics (0.1.7)
|
|
26
29
|
json (2.2.0)
|
|
27
30
|
method_source (0.9.2)
|
|
@@ -34,12 +37,12 @@ GEM
|
|
|
34
37
|
pry-byebug (3.7.0)
|
|
35
38
|
byebug (~> 11.0)
|
|
36
39
|
pry (~> 0.10)
|
|
37
|
-
public_suffix (
|
|
40
|
+
public_suffix (4.0.1)
|
|
38
41
|
rake (10.5.0)
|
|
39
|
-
relaton-bib (0.3.
|
|
42
|
+
relaton-bib (0.3.6)
|
|
40
43
|
addressable
|
|
41
|
-
nokogiri
|
|
42
|
-
relaton-iso-bib (0.3.
|
|
44
|
+
nokogiri
|
|
45
|
+
relaton-iso-bib (0.3.5)
|
|
43
46
|
isoics (~> 0.1.6)
|
|
44
47
|
relaton-bib (~> 0.3.0)
|
|
45
48
|
ruby_deep_clone (~> 0.8.0)
|
|
@@ -59,11 +62,17 @@ GEM
|
|
|
59
62
|
ruby-debug-ide (0.7.0)
|
|
60
63
|
rake (>= 0.8.1)
|
|
61
64
|
ruby_deep_clone (0.8.0)
|
|
65
|
+
safe_yaml (1.0.5)
|
|
62
66
|
simplecov (0.16.1)
|
|
63
67
|
docile (~> 1.1)
|
|
64
68
|
json (>= 1.8, < 3)
|
|
65
69
|
simplecov-html (~> 0.10.0)
|
|
66
70
|
simplecov-html (0.10.2)
|
|
71
|
+
vcr (5.0.0)
|
|
72
|
+
webmock (3.7.0)
|
|
73
|
+
addressable (>= 2.3.6)
|
|
74
|
+
crack (>= 0.3.2)
|
|
75
|
+
hashdiff (>= 0.4.0, < 2.0.0)
|
|
67
76
|
|
|
68
77
|
PLATFORMS
|
|
69
78
|
ruby
|
|
@@ -78,6 +87,8 @@ DEPENDENCIES
|
|
|
78
87
|
rspec (~> 3.0)
|
|
79
88
|
ruby-debug-ide
|
|
80
89
|
simplecov
|
|
90
|
+
vcr
|
|
91
|
+
webmock
|
|
81
92
|
|
|
82
93
|
BUNDLED WITH
|
|
83
94
|
2.0.1
|
|
@@ -91,16 +91,11 @@ module RelatonGb
|
|
|
91
91
|
def search_filter(code)
|
|
92
92
|
# search filter needs to incorporate year
|
|
93
93
|
docidrx = %r{^[^\s]+\s[\d\.-]+}
|
|
94
|
-
# corrigrx = %r{^[^\s]+\s[\d\.]+-[0-9]+/}
|
|
95
94
|
warn "fetching #{code}..."
|
|
96
95
|
result = search(code)
|
|
97
|
-
|
|
98
|
-
hit.
|
|
99
|
-
# !corrigrx =~ hit.title
|
|
96
|
+
result.select do |hit|
|
|
97
|
+
hit.docref && hit.docref.match(docidrx).to_s.include?(code)
|
|
100
98
|
end
|
|
101
|
-
return ret unless ret.empty?
|
|
102
|
-
|
|
103
|
-
[]
|
|
104
99
|
end
|
|
105
100
|
|
|
106
101
|
# Sort through the results from Isobib, fetching them three at a time,
|
|
@@ -125,12 +120,15 @@ module RelatonGb
|
|
|
125
120
|
{ years: missed_years }
|
|
126
121
|
end
|
|
127
122
|
|
|
128
|
-
|
|
129
|
-
|
|
123
|
+
# @param hits [RelatonBib::HitCollection<RelatonBib::Hit>]
|
|
124
|
+
# @param threads [Integer]
|
|
125
|
+
# @return [Array<RelatonGb::GbBibliographicItem>]
|
|
126
|
+
def fetch_pages(hits, threads)
|
|
127
|
+
workers = RelatonBib::WorkersPool.new threads
|
|
130
128
|
workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
|
|
131
|
-
|
|
129
|
+
hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
|
|
132
130
|
workers.end
|
|
133
|
-
workers.result.
|
|
131
|
+
workers.result.sort_by { |x| x[:i] }.map { |x| x[:hit] }
|
|
134
132
|
end
|
|
135
133
|
end
|
|
136
134
|
end
|
|
@@ -16,35 +16,40 @@ module RelatonGb
|
|
|
16
16
|
# @return [RelatonGb::HitCollection]
|
|
17
17
|
def scrape_page(text)
|
|
18
18
|
search_html = OpenURI.open_uri(
|
|
19
|
-
"http://
|
|
19
|
+
"http://openstd.samr.gov.cn/bzgk/gb/std_list?p.p2=" + text
|
|
20
20
|
)
|
|
21
21
|
result = Nokogiri::HTML search_html
|
|
22
|
-
hits = result.
|
|
23
|
-
|
|
22
|
+
hits = result.xpath(
|
|
23
|
+
"//table[contains(@class, 'result_list')]/tbody[2]/tr",
|
|
24
|
+
).map do |h|
|
|
25
|
+
ref = h.at "./td[2]/a"
|
|
26
|
+
pid = ref[:onclick].match(/[0-9A-F]+/).to_s
|
|
27
|
+
rdate = h.at("./td[7]").text
|
|
28
|
+
Hit.new pid: pid, docref: ref.text, scrapper: self, release_date: rdate
|
|
24
29
|
end
|
|
25
|
-
HitCollection.new hits
|
|
30
|
+
HitCollection.new hits.sort_by(&:release_date).reverse
|
|
26
31
|
rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
|
|
27
|
-
raise RelatonBib::RequestError, "Cannot access http://www.std.gov.cn/
|
|
32
|
+
raise RelatonBib::RequestError, "Cannot access http://www.std.gov.cn/bzgk/gb/std_list"
|
|
28
33
|
end
|
|
29
34
|
|
|
30
|
-
# @param
|
|
35
|
+
# @param hit [RelatonGb::Hit] standard's page id
|
|
31
36
|
# @return [RelatonGb::GbBibliographicItem]
|
|
32
|
-
def scrape_doc(
|
|
33
|
-
src = "http://
|
|
37
|
+
def scrape_doc(hit)
|
|
38
|
+
src = "http://openstd.samr.gov.cn/bzgk/gb/newGbInfo?hcno=" + hit.pid
|
|
34
39
|
doc = Nokogiri::HTML OpenURI.open_uri(src)
|
|
35
|
-
GbBibliographicItem.new scrapped_data(doc, src
|
|
40
|
+
GbBibliographicItem.new scrapped_data(doc, src, hit)
|
|
36
41
|
rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
|
|
37
42
|
raise RelatonBib::RequestError, "Cannot access #{src}"
|
|
38
43
|
end
|
|
39
44
|
|
|
40
45
|
# @param doc [Nokogiri::HTML]
|
|
46
|
+
# @param _ref [String]
|
|
41
47
|
# @return [Hash]
|
|
42
48
|
# * :type [String]
|
|
43
49
|
# * :name [String]
|
|
44
|
-
def get_committee(doc)
|
|
45
|
-
name = doc.
|
|
46
|
-
|
|
47
|
-
{ type: "technical", name: name }
|
|
50
|
+
def get_committee(doc, _ref)
|
|
51
|
+
name = doc.at("//div[contains(text(), '归口单位')]/following-sibling::div")
|
|
52
|
+
{ type: "technical", name: name.text.delete("\r\n\t\t") }
|
|
48
53
|
end
|
|
49
54
|
end
|
|
50
55
|
end
|
data/lib/relaton_gb/hit.rb
CHANGED
|
@@ -7,28 +7,36 @@ module RelatonGb
|
|
|
7
7
|
attr_reader :hit_collection
|
|
8
8
|
|
|
9
9
|
# @return [String]
|
|
10
|
-
attr_reader :pid
|
|
10
|
+
attr_reader :pid, :docref
|
|
11
11
|
|
|
12
|
-
# @return [
|
|
13
|
-
attr_reader :
|
|
12
|
+
# @return [Date, NilClass]
|
|
13
|
+
attr_reader :release_date
|
|
14
|
+
|
|
15
|
+
# @return [String, NilClass]
|
|
16
|
+
attr_reader :status
|
|
14
17
|
|
|
15
18
|
# @return [RelatonGb::GbScrapper, RelatonGb::SecScraper, RelatonGb::TScrapper]
|
|
16
19
|
attr_reader :scrapper
|
|
17
20
|
|
|
18
|
-
# @param
|
|
19
|
-
# @param
|
|
20
|
-
|
|
21
|
+
# @param pid [String]
|
|
22
|
+
# @param docref [String]
|
|
23
|
+
# @param scrapper [RelatonGb::GbScrapper, RelatonGb::SecScraper, RelatonGb::TScrapper]
|
|
24
|
+
# @param release_date [String]
|
|
25
|
+
# @param status [String, NilClass]
|
|
26
|
+
# @param hit_collection [RelatonGb::HitCollection, NilClass]
|
|
27
|
+
def initialize(pid:, docref:, scrapper:, **args)
|
|
21
28
|
@pid = pid
|
|
22
|
-
@
|
|
23
|
-
@hit_collection = hit_collection
|
|
29
|
+
@docref = docref
|
|
24
30
|
@scrapper = scrapper
|
|
25
|
-
|
|
31
|
+
@release_date = Date.parse args[:release_date] if args[:release_date]
|
|
32
|
+
@status = args[:status]
|
|
33
|
+
@hit_collection = args[:hit_collection]
|
|
26
34
|
end
|
|
27
35
|
|
|
28
36
|
# Parse page.
|
|
29
37
|
# @return [RelatonGb::GbBibliographicItem]
|
|
30
38
|
def fetch
|
|
31
|
-
@fetch ||= scrapper.scrape_doc
|
|
39
|
+
@fetch ||= scrapper.scrape_doc self
|
|
32
40
|
end
|
|
33
41
|
|
|
34
42
|
# @return [String]
|
|
@@ -40,21 +48,7 @@ module RelatonGb
|
|
|
40
48
|
def inspect
|
|
41
49
|
"<#{self.class}:#{format('%#.14x', object_id << 1)} "\
|
|
42
50
|
"@fullIdentifier=\"#{@fetch&.shortref}\" "\
|
|
43
|
-
"@
|
|
51
|
+
"@docref=\"#{docref}\">"
|
|
44
52
|
end
|
|
45
|
-
|
|
46
|
-
# @param builder [Nokogiri::XML::Builder]
|
|
47
|
-
# @param opts [Hash]
|
|
48
|
-
# @return [String]
|
|
49
|
-
# def to_xml(builder = nil, opts = {})
|
|
50
|
-
# if builder
|
|
51
|
-
# fetch.to_xml builder, opts
|
|
52
|
-
# else
|
|
53
|
-
# builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
|
|
54
|
-
# fetch.to_xml xml, opts
|
|
55
|
-
# end
|
|
56
|
-
# builder.doc.root.to_xml
|
|
57
|
-
# end
|
|
58
|
-
# end
|
|
59
53
|
end
|
|
60
54
|
end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "relaton/processor"
|
|
4
|
+
|
|
5
|
+
module RelatonGb
|
|
6
|
+
class Processor < Relaton::Processor
|
|
7
|
+
def initialize
|
|
8
|
+
@short = :relaton_gb
|
|
9
|
+
@prefix = "CN"
|
|
10
|
+
@defaultprefix = %r{^GB }
|
|
11
|
+
@idtype = "Chinese Standard"
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
# @param code [String]
|
|
15
|
+
# @param date [String, NilClass] year
|
|
16
|
+
# @param opts [Hash]
|
|
17
|
+
# @return [RelatonGb::GbBibliographicItem]
|
|
18
|
+
def get(code, date, opts)
|
|
19
|
+
::RelatonGb::GbBibliography.get(code, date, opts)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
# @param xml [String]
|
|
23
|
+
# @return [RelatonGb::GbBibliographicItem]
|
|
24
|
+
def from_xml(xml)
|
|
25
|
+
::RelatonGb::XMLParser.from_xml xml
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
# @param hash [Hash]
|
|
29
|
+
# @return [RelatonGb::GbBibliographicItem]
|
|
30
|
+
def hash_to_bib(hash)
|
|
31
|
+
item_hash = ::RelatonGb::HashConverter.hash_to_bib(hash)
|
|
32
|
+
::RelatonGb::GbBibliographicItem.new item_hash
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
end
|
data/lib/relaton_gb/scrapper.rb
CHANGED
|
@@ -11,64 +11,54 @@ module RelatonGb
|
|
|
11
11
|
|
|
12
12
|
# rubocop:disable Metrics/MethodLength
|
|
13
13
|
# @param doc [Nokogiri::HTML::Document]
|
|
14
|
-
# @param src [String]
|
|
14
|
+
# @param src [String]
|
|
15
|
+
# @param hit [RelatonGb::Hit]
|
|
15
16
|
# @return [Hash]
|
|
16
|
-
def scrapped_data(doc, src
|
|
17
|
+
def scrapped_data(doc, src, hit)
|
|
17
18
|
{
|
|
18
|
-
committee: get_committee(doc),
|
|
19
|
-
docid: get_docid(
|
|
19
|
+
committee: get_committee(doc, hit.docref),
|
|
20
|
+
docid: get_docid(hit.docref),
|
|
20
21
|
title: get_titles(doc),
|
|
21
|
-
contributor: get_contributors(doc),
|
|
22
|
-
type: get_type
|
|
23
|
-
docstatus: get_status(doc),
|
|
24
|
-
gbtype: get_gbtype(doc),
|
|
22
|
+
contributor: get_contributors(doc, hit.docref),
|
|
23
|
+
type: get_type,
|
|
24
|
+
docstatus: get_status(doc, hit.status),
|
|
25
|
+
gbtype: get_gbtype(doc, hit.docref),
|
|
25
26
|
ccs: get_ccs(doc),
|
|
26
27
|
ics: get_ics(doc),
|
|
27
28
|
link: [{ type: "src", content: src }],
|
|
28
29
|
date: get_dates(doc),
|
|
29
30
|
language: ["zh"],
|
|
30
31
|
script: ["Hans"],
|
|
31
|
-
structuredidentifier: fetch_structuredidentifier(
|
|
32
|
+
structuredidentifier: fetch_structuredidentifier(hit.docref),
|
|
32
33
|
}
|
|
33
34
|
end
|
|
34
35
|
# rubocop:enable Metrics/MethodLength
|
|
35
36
|
|
|
36
|
-
# @param
|
|
37
|
-
# @param xpt [String]
|
|
37
|
+
# @param docref [String]
|
|
38
38
|
# @return [Array<RelatonBib::DocumentIdentifier>]
|
|
39
|
-
def get_docid(
|
|
40
|
-
|
|
41
|
-
return [] unless item_ref
|
|
42
|
-
|
|
43
|
-
[RelatonBib::DocumentIdentifier.new(id: item_ref.text, type: "Chinese Standard")]
|
|
39
|
+
def get_docid(docref)
|
|
40
|
+
[RelatonBib::DocumentIdentifier.new(id: docref, type: "Chinese Standard")]
|
|
44
41
|
end
|
|
45
42
|
|
|
46
|
-
# @param
|
|
47
|
-
# @param xpt [String]
|
|
43
|
+
# @param docref [String]
|
|
48
44
|
# @return [RelatonIsoBib::StructuredIdentifier]
|
|
49
|
-
def fetch_structuredidentifier(
|
|
50
|
-
|
|
51
|
-
unless item_ref
|
|
52
|
-
return RelatonIsoBib::StructuredIdentifier.new(
|
|
53
|
-
project_number: "?", part_number: "?", prefix: nil, id: "?",
|
|
54
|
-
type: "Chinese Standard"
|
|
55
|
-
)
|
|
56
|
-
end
|
|
57
|
-
|
|
58
|
-
m = item_ref.text.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
|
|
59
|
-
# prefix = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
|
|
45
|
+
def fetch_structuredidentifier(docref)
|
|
46
|
+
m = docref.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
|
|
60
47
|
RelatonIsoBib::StructuredIdentifier.new(
|
|
61
48
|
project_number: m[1], part_number: m[2], prefix: nil,
|
|
62
|
-
id:
|
|
49
|
+
id: docref, type: "Chinese Standard"
|
|
63
50
|
)
|
|
64
51
|
end
|
|
65
52
|
|
|
66
|
-
|
|
53
|
+
# @param doc [Nokogiri::HTML::Document]
|
|
54
|
+
# @param docref [String]
|
|
55
|
+
# @return [Array<Hash>]
|
|
56
|
+
def get_contributors(doc, docref)
|
|
67
57
|
gb_en = GbAgencies::Agencies.new("en", {}, "")
|
|
68
58
|
gb_zh = GbAgencies::Agencies.new("zh", {}, "")
|
|
69
|
-
name =
|
|
59
|
+
name = docref.match(/^[^\s]+/).to_s
|
|
70
60
|
name.sub!(%r{/[TZ]$}, "") unless name =~ /^GB/
|
|
71
|
-
gbtype = get_gbtype(doc)
|
|
61
|
+
gbtype = get_gbtype(doc, docref)
|
|
72
62
|
entity = RelatonBib::Organization.new name: [
|
|
73
63
|
{ language: "en", content: gb_en.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
|
|
74
64
|
{ language: "zh", content: gb_zh.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
|
|
@@ -83,57 +73,56 @@ module RelatonGb
|
|
|
83
73
|
# * :language [String]
|
|
84
74
|
# * :script [String]
|
|
85
75
|
def get_titles(doc)
|
|
86
|
-
titles = [{ title_main: doc.
|
|
87
|
-
language: "zh", script: "Hans" }]
|
|
88
|
-
title_main = doc.
|
|
76
|
+
titles = [{ title_main: doc.at("//td[contains(text(), '中文标准名称')]/b").text,
|
|
77
|
+
title_intro: nil, language: "zh", script: "Hans" }]
|
|
78
|
+
title_main = doc.at("//td[contains(text(), '英文标准名称')]").text.match(/[\w\s]+/).to_s
|
|
89
79
|
unless title_main.empty?
|
|
90
80
|
titles << { title_main: title_main, title_intro: nil, language: "en", script: "Latn" }
|
|
91
81
|
end
|
|
92
82
|
titles
|
|
93
83
|
end
|
|
94
84
|
|
|
95
|
-
def get_type
|
|
96
|
-
"
|
|
85
|
+
def get_type
|
|
86
|
+
"standard"
|
|
97
87
|
end
|
|
98
88
|
|
|
99
89
|
# @param doc [Nokogiri::HTML::Document]
|
|
100
|
-
# @param
|
|
90
|
+
# @param status [String, NilClass]
|
|
101
91
|
# @return [RelatonBib::DocumentStatus]
|
|
102
|
-
def get_status(doc,
|
|
103
|
-
case doc.at(
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
when "废止"
|
|
109
|
-
stage = "obsoleted"
|
|
110
|
-
end
|
|
92
|
+
def get_status(doc, status = nil)
|
|
93
|
+
stage = case status || doc.at("//td[contains(., '标准状态')]/span")&.text
|
|
94
|
+
when "即将实施" then "published"
|
|
95
|
+
when "现行" then "activated"
|
|
96
|
+
when "废止" then "obsoleted"
|
|
97
|
+
end
|
|
111
98
|
RelatonBib::DocumentStatus.new stage: stage
|
|
112
99
|
end
|
|
113
100
|
|
|
114
101
|
private
|
|
115
102
|
|
|
116
103
|
# @param doc [Nokogiri::HTML::Document]
|
|
104
|
+
# @param ref [String]
|
|
117
105
|
# @return [Hash]
|
|
118
106
|
# * :scope [String]
|
|
119
107
|
# * :prefix [String]
|
|
120
108
|
# * :mandate [String]
|
|
121
|
-
def get_gbtype(doc)
|
|
122
|
-
ref = get_ref(doc)
|
|
109
|
+
def get_gbtype(doc, ref)
|
|
110
|
+
# ref = get_ref(doc)
|
|
123
111
|
{ scope: get_scope(doc), prefix: get_prefix(ref)["prefix"],
|
|
124
112
|
mandate: get_mandate(ref) }
|
|
125
113
|
end
|
|
126
114
|
|
|
127
115
|
# @param doc [Nokogiri::HTML::Document]
|
|
128
116
|
# @return [String]
|
|
129
|
-
def get_ref(doc)
|
|
130
|
-
|
|
131
|
-
end
|
|
117
|
+
# def get_ref(doc)
|
|
118
|
+
# doc.xpath('//dt[text()="标准号"]/following-sibling::dd[1]').text
|
|
119
|
+
# end
|
|
132
120
|
|
|
133
121
|
# @param doc [Nokogiri::HTML::Document]
|
|
134
122
|
# @return [Array<String>]
|
|
135
123
|
def get_ccs(doc)
|
|
136
|
-
[doc
|
|
124
|
+
[doc.at("//div[contains(text(), '中国标准分类号')]/following-sibling::div").
|
|
125
|
+
text.delete("\r\n\t\t")]
|
|
137
126
|
end
|
|
138
127
|
|
|
139
128
|
# @param doc [Nokogiri::HTML::Document]
|
|
@@ -142,21 +131,21 @@ module RelatonGb
|
|
|
142
131
|
# * :group [String]
|
|
143
132
|
# * :subgroup [String]
|
|
144
133
|
def get_ics(doc)
|
|
145
|
-
ics = doc.
|
|
146
|
-
|
|
134
|
+
ics = doc.at("//div[contains(text(), '国际标准分类号')]/following-sibling::div"\
|
|
135
|
+
" | //dt[contains(text(), '国际标准分类号')]/following-sibling::dd")
|
|
136
|
+
return [] unless ics
|
|
147
137
|
|
|
148
|
-
field, group, subgroup = ics.text.split "."
|
|
138
|
+
field, group, subgroup = ics.text.delete("\r\n\t\t").split "."
|
|
149
139
|
[{ field: field, group: group.ljust(3, "0"), subgroup: subgroup }]
|
|
150
140
|
end
|
|
151
141
|
|
|
152
142
|
# @param doc [Nokogiri::HTML::Document]
|
|
153
143
|
# @return [String]
|
|
154
144
|
def get_scope(doc)
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
"sector"
|
|
145
|
+
issued = doc.at("//div[contains(., '发布单位')]/following-sibling::div")
|
|
146
|
+
case issued&.text
|
|
147
|
+
when /国家标准/ then "national"
|
|
148
|
+
when /^行业标准/ then "sector"
|
|
160
149
|
end
|
|
161
150
|
end
|
|
162
151
|
|
|
@@ -170,8 +159,7 @@ module RelatonGb
|
|
|
170
159
|
# @param pref [String]
|
|
171
160
|
# @return [Hash{String=>String}]
|
|
172
161
|
def prefix(pref)
|
|
173
|
-
|
|
174
|
-
@prefixes ||= YAML.load_file(file_path)
|
|
162
|
+
@prefixes ||= YAML.load_file File.join(__dir__, "yaml/prefixes.yaml")
|
|
175
163
|
@prefixes[pref]
|
|
176
164
|
end
|
|
177
165
|
|
|
@@ -190,8 +178,9 @@ module RelatonGb
|
|
|
190
178
|
# * :type [String] type of date
|
|
191
179
|
# * :on [String] date
|
|
192
180
|
def get_dates(doc)
|
|
193
|
-
date = doc.
|
|
194
|
-
|
|
181
|
+
date = doc.at("//div[contains(text(), '发布日期')]/following-sibling::div"\
|
|
182
|
+
" | //dt[contains(text(), '发布日期')]/following-sibling::dd")
|
|
183
|
+
[{ type: "published", on: date.text.delete("\r\n\t\t") }]
|
|
195
184
|
end
|
|
196
185
|
end
|
|
197
186
|
end
|
|
@@ -18,42 +18,77 @@ module RelatonGb
|
|
|
18
18
|
# @param text [String] code of standard for search
|
|
19
19
|
# @return [RelatonGb::HitCollection]
|
|
20
20
|
def scrape_page(text)
|
|
21
|
-
uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{text}"
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
21
|
+
# uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{text}"
|
|
22
|
+
uri = URI "http://hbba.sacinfo.org.cn/stdQueryList"
|
|
23
|
+
resp = Net::HTTP.post uri, URI.encode_www_form({ key: text })
|
|
24
|
+
# res = JSON.parse Net::HTTP.get(uri)
|
|
25
|
+
json = JSON.parse resp.body
|
|
26
|
+
hits = json["records"].map do |h|
|
|
27
|
+
Hit.new pid: h["pk"], docref: h["code"], status: h["status"], scrapper: self
|
|
25
28
|
end
|
|
29
|
+
# hits = res["rows"].map do |r|
|
|
30
|
+
# Hit.new pid: r["id"], title: r["STD_CODE"], scrapper: self
|
|
31
|
+
# end
|
|
26
32
|
HitCollection.new hits
|
|
27
33
|
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
|
|
28
34
|
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
|
|
29
|
-
OpenSSL::SSL::SSLError
|
|
35
|
+
OpenSSL::SSL::SSLError, Errno::ETIMEDOUT
|
|
30
36
|
raise RelatonBib::RequestError, "Cannot access #{uri}"
|
|
31
37
|
end
|
|
32
38
|
|
|
33
|
-
# @param
|
|
39
|
+
# @param hit [RelatonGb::Hit]
|
|
34
40
|
# @return [RelatonGb::GbBibliographicItem]
|
|
35
|
-
def scrape_doc(
|
|
36
|
-
src = "http://
|
|
41
|
+
def scrape_doc(hit)
|
|
42
|
+
src = "http://hbba.sacinfo.org.cn/stdDetail/#{hit.pid}"
|
|
37
43
|
page_uri = URI src
|
|
38
44
|
doc = Nokogiri::HTML Net::HTTP.get(page_uri)
|
|
39
|
-
GbBibliographicItem.new scrapped_data(doc, src
|
|
45
|
+
GbBibliographicItem.new scrapped_data(doc, src, hit)
|
|
40
46
|
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
|
|
41
47
|
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
|
|
42
|
-
OpenSSL::SSL::SSLError
|
|
48
|
+
OpenSSL::SSL::SSLError, Errno::ETIMEDOUT
|
|
43
49
|
raise RelatonBib::RequestError, "Cannot access #{src}"
|
|
44
50
|
end
|
|
45
51
|
|
|
46
52
|
private
|
|
47
53
|
|
|
48
54
|
# @param doc [Nokogiri::HTML::Document]
|
|
55
|
+
# @return [Array<Hash>]
|
|
56
|
+
# * :title_intro [String]
|
|
57
|
+
# * :title_main [String]
|
|
58
|
+
# * :language [String]
|
|
59
|
+
# * :script [String]
|
|
60
|
+
def get_titles(doc)
|
|
61
|
+
titles = [{ title_main: doc.at("//h4").text.delete("\r\n\t"),
|
|
62
|
+
title_intro: nil, language: "zh", script: "Hans" }]
|
|
63
|
+
# title_main = doc.at("//td[contains(text(), '英文标准名称')]").text.match(/[\w\s]+/).to_s
|
|
64
|
+
# unless title_main.empty?
|
|
65
|
+
# titles << { title_main: title_main, title_intro: nil, language: "en", script: "Latn" }
|
|
66
|
+
# end
|
|
67
|
+
titles
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
# @param _doc [Nokogiri::HTML::Document]
|
|
71
|
+
# @param ref [String]
|
|
49
72
|
# @return [Hash]
|
|
50
73
|
# * :type [String]
|
|
51
74
|
# * :name [String]
|
|
52
|
-
def get_committee(
|
|
53
|
-
ref = get_ref(doc)
|
|
75
|
+
def get_committee(_doc, ref)
|
|
76
|
+
# ref = get_ref(doc)
|
|
54
77
|
name = get_prefix(ref)["administration"]
|
|
55
78
|
{ type: "technical", name: name }
|
|
56
79
|
end
|
|
80
|
+
|
|
81
|
+
# @param _doc [Nokogiri::HTML::Document]
|
|
82
|
+
# @return [String]
|
|
83
|
+
def get_scope(_doc)
|
|
84
|
+
"sector"
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# @param doc [Nokogiri::HTML::Document]
|
|
88
|
+
# @return [Array<String>]
|
|
89
|
+
def get_ccs(doc)
|
|
90
|
+
[doc.at("//dt[contains(text(), '中国标准分类号')]/following-sibling::dd").text]
|
|
91
|
+
end
|
|
57
92
|
end
|
|
58
93
|
end
|
|
59
94
|
end
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# frozen_string_literal: true
|
|
3
3
|
|
|
4
4
|
require "open-uri"
|
|
5
|
+
require "net/http"
|
|
5
6
|
require "nokogiri"
|
|
6
7
|
require "relaton_gb/scrapper"
|
|
7
8
|
require "relaton_gb/gb_bibliographic_item"
|
|
@@ -21,13 +22,15 @@ module RelatonGb
|
|
|
21
22
|
search_html = OpenURI.open_uri(
|
|
22
23
|
"http://www.ttbz.org.cn/Home/Standard?searchType=2&key=" +
|
|
23
24
|
CGI.escape(text.tr("-", [8212].pack("U"))),
|
|
24
|
-
)
|
|
25
|
+
).read
|
|
25
26
|
header = Nokogiri::HTML search_html
|
|
26
27
|
xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
|
|
27
|
-
t_xpath = "../preceding-sibling::td[
|
|
28
|
+
t_xpath = "../preceding-sibling::td[4]"
|
|
28
29
|
hits = header.xpath(xpath).map do |h|
|
|
29
|
-
|
|
30
|
-
|
|
30
|
+
docref = h.at(t_xpath).text.gsub(/â\u0080\u0094/, "-")
|
|
31
|
+
status = h.at("../preceding-sibling::td[1]").text.delete "\r\n"
|
|
32
|
+
pid = h[:href].sub(%r{\/$}, "")
|
|
33
|
+
Hit.new pid: pid, docref: docref, status: status, scrapper: self
|
|
31
34
|
end
|
|
32
35
|
HitCollection.new hits
|
|
33
36
|
rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
|
|
@@ -35,12 +38,12 @@ module RelatonGb
|
|
|
35
38
|
end
|
|
36
39
|
# rubocop:enable Metrics/MethodLength, Metrics/AbcSize
|
|
37
40
|
|
|
38
|
-
# @param
|
|
41
|
+
# @param hit [RelatonGb::Hit] standard's page path
|
|
39
42
|
# @return [RelatonGb::GbBibliographicItem]
|
|
40
|
-
def scrape_doc(
|
|
41
|
-
src = "http://www.ttbz.org.cn#{pid}"
|
|
43
|
+
def scrape_doc(hit)
|
|
44
|
+
src = "http://www.ttbz.org.cn#{hit.pid}"
|
|
42
45
|
doc = Nokogiri::HTML OpenURI.open_uri(src), nil, Encoding::UTF_8.to_s
|
|
43
|
-
GbBibliographicItem.new scrapped_data(doc, src
|
|
46
|
+
GbBibliographicItem.new scrapped_data(doc, src, hit)
|
|
44
47
|
rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
|
|
45
48
|
raise RelatonBib::RequestError, "Cannot access #{src}"
|
|
46
49
|
end
|
|
@@ -49,16 +52,18 @@ module RelatonGb
|
|
|
49
52
|
|
|
50
53
|
# rubocop:disable Metrics/MethodLength
|
|
51
54
|
# @param doc [Nokogiri::HTML::Document]
|
|
55
|
+
# @param src [String]
|
|
56
|
+
# @param hit [RelatonGb::Hit]
|
|
52
57
|
# @return [Hash]
|
|
53
|
-
def scrapped_data(doc, src
|
|
54
|
-
docid_xpt = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
|
|
55
|
-
status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
|
|
58
|
+
def scrapped_data(doc, src, hit)
|
|
59
|
+
# docid_xpt = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
|
|
60
|
+
# status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
|
|
56
61
|
{
|
|
57
|
-
committee: get_committee(doc),
|
|
58
|
-
docid: get_docid(
|
|
62
|
+
committee: get_committee(doc, hit.docref),
|
|
63
|
+
docid: get_docid(hit.docref),
|
|
59
64
|
title: get_titles(doc),
|
|
60
|
-
type:
|
|
61
|
-
docstatus: get_status(doc,
|
|
65
|
+
type: get_type,
|
|
66
|
+
docstatus: get_status(doc, hit.status),
|
|
62
67
|
gbtype: gbtype,
|
|
63
68
|
ccs: get_ccs(doc),
|
|
64
69
|
ics: get_ics(doc),
|
|
@@ -66,12 +71,12 @@ module RelatonGb
|
|
|
66
71
|
date: get_dates(doc),
|
|
67
72
|
language: ["zh"],
|
|
68
73
|
script: ["Hans"],
|
|
69
|
-
structuredidentifier: fetch_structuredidentifier(
|
|
74
|
+
structuredidentifier: fetch_structuredidentifier(hit.docref),
|
|
70
75
|
}
|
|
71
76
|
end
|
|
72
77
|
# rubocop:enable Metrics/MethodLength
|
|
73
78
|
|
|
74
|
-
def get_committee(doc)
|
|
79
|
+
def get_committee(doc, _ref)
|
|
75
80
|
{
|
|
76
81
|
name: doc.xpath('//td[.="团体名称"]/following-sibling::td[1]').text,
|
|
77
82
|
type: "technical",
|
data/lib/relaton_gb/version.rb
CHANGED
data/lib/relaton_gb.rb
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
require "relaton_gb/version"
|
|
2
2
|
require "relaton_gb/gb_bibliography"
|
|
3
3
|
|
|
4
|
-
if defined? Relaton
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
4
|
+
# if defined? Relaton
|
|
5
|
+
# require "relaton_gb/processor"
|
|
6
|
+
# # don't register the gem if it's required from relaton's registry
|
|
7
|
+
# return if caller.detect { |c| c.include? "register_gems" }
|
|
8
|
+
|
|
9
|
+
# Relaton::Registry.instance.register RelatonGb::Processor
|
|
10
|
+
# end
|
data/relaton_gb.gemspec
CHANGED
|
@@ -32,6 +32,8 @@ Gem::Specification.new do |spec|
|
|
|
32
32
|
spec.add_development_dependency "rspec", "~> 3.0"
|
|
33
33
|
spec.add_development_dependency "ruby-debug-ide"
|
|
34
34
|
spec.add_development_dependency "simplecov"
|
|
35
|
+
spec.add_development_dependency "vcr"
|
|
36
|
+
spec.add_development_dependency "webmock"
|
|
35
37
|
|
|
36
38
|
spec.add_dependency "cnccs", "~> 0.1.1"
|
|
37
39
|
spec.add_dependency "gb-agencies", "~> 0.0.1"
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: relaton-gb
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.6.
|
|
4
|
+
version: 0.6.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2019-
|
|
11
|
+
date: 2019-09-12 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -122,6 +122,34 @@ dependencies:
|
|
|
122
122
|
- - ">="
|
|
123
123
|
- !ruby/object:Gem::Version
|
|
124
124
|
version: '0'
|
|
125
|
+
- !ruby/object:Gem::Dependency
|
|
126
|
+
name: vcr
|
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
|
128
|
+
requirements:
|
|
129
|
+
- - ">="
|
|
130
|
+
- !ruby/object:Gem::Version
|
|
131
|
+
version: '0'
|
|
132
|
+
type: :development
|
|
133
|
+
prerelease: false
|
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
135
|
+
requirements:
|
|
136
|
+
- - ">="
|
|
137
|
+
- !ruby/object:Gem::Version
|
|
138
|
+
version: '0'
|
|
139
|
+
- !ruby/object:Gem::Dependency
|
|
140
|
+
name: webmock
|
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
|
142
|
+
requirements:
|
|
143
|
+
- - ">="
|
|
144
|
+
- !ruby/object:Gem::Version
|
|
145
|
+
version: '0'
|
|
146
|
+
type: :development
|
|
147
|
+
prerelease: false
|
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
149
|
+
requirements:
|
|
150
|
+
- - ">="
|
|
151
|
+
- !ruby/object:Gem::Version
|
|
152
|
+
version: '0'
|
|
125
153
|
- !ruby/object:Gem::Dependency
|
|
126
154
|
name: cnccs
|
|
127
155
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -185,7 +213,6 @@ files:
|
|
|
185
213
|
- appveyor.yml
|
|
186
214
|
- bin/console
|
|
187
215
|
- bin/setup
|
|
188
|
-
- lib/relaton/processor.rb
|
|
189
216
|
- lib/relaton_gb.rb
|
|
190
217
|
- lib/relaton_gb/ccs.rb
|
|
191
218
|
- lib/relaton_gb/gb_bibliographic_item.rb
|
|
@@ -196,6 +223,7 @@ files:
|
|
|
196
223
|
- lib/relaton_gb/hash_converter.rb
|
|
197
224
|
- lib/relaton_gb/hit.rb
|
|
198
225
|
- lib/relaton_gb/hit_collection.rb
|
|
226
|
+
- lib/relaton_gb/processor.rb
|
|
199
227
|
- lib/relaton_gb/scrapper.rb
|
|
200
228
|
- lib/relaton_gb/sec_scrapper.rb
|
|
201
229
|
- lib/relaton_gb/t_scrapper.rb
|
data/lib/relaton/processor.rb
DELETED
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "relaton/processor"
|
|
4
|
-
|
|
5
|
-
module Relaton
|
|
6
|
-
module RelatonGb
|
|
7
|
-
class Processor < Relaton::Processor
|
|
8
|
-
def initialize
|
|
9
|
-
@short = :relaton_gb
|
|
10
|
-
@prefix = "CN"
|
|
11
|
-
@defaultprefix = %r{^GB }
|
|
12
|
-
@idtype = "Chinese Standard"
|
|
13
|
-
end
|
|
14
|
-
|
|
15
|
-
def get(code, date, opts)
|
|
16
|
-
::RelatonGb::GbBibliography.get(code, date, opts)
|
|
17
|
-
end
|
|
18
|
-
|
|
19
|
-
def from_xml(xml)
|
|
20
|
-
::RelatonGb::XMLParser.from_xml xml
|
|
21
|
-
end
|
|
22
|
-
end
|
|
23
|
-
end
|
|
24
|
-
end
|