gbbib 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/gbbib/gb_scrapper.rb +18 -10
- data/lib/gbbib/sec_scrapper.rb +14 -6
- data/lib/gbbib/t_scrapper.rb +23 -15
- data/lib/gbbib/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 58ab839578d5b3328e69d578d2eedd6f5d4ffd50d335640166c036e14ef1529a
|
4
|
+
data.tar.gz: 255fad3a6567026a7e9d84ed2b4cec4adca96338f55f5f2b6c615cd8770055d9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ff92e79b4036741e3b14265b164ef4d425fb4364428ee0c7d44dfcbcc76c8afd41f22613f69bf5f329c486149ed10d72e554f595ab99c0e05bb9d3bc899a9450
|
7
|
+
data.tar.gz: 56a556ca5591226870f02938348c5b256c536cecd062d2f430a466bcdb3b50cefe07384fbb3b08cea771a07e6073a02c73828c45c4077c88d1bbd350d6764eb2
|
data/Gemfile.lock
CHANGED
data/lib/gbbib/gb_scrapper.rb
CHANGED
@@ -18,22 +18,30 @@ module Gbbib
|
|
18
18
|
# @param text [String] code of standard for search
|
19
19
|
# @return [Gbbib::HitCollection]
|
20
20
|
def scrape_page(text)
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
21
|
+
begin
|
22
|
+
search_html = OpenURI.open_uri(
|
23
|
+
'http://www.std.gov.cn/search/stdPage?q=' + text
|
24
|
+
)
|
25
|
+
result = Nokogiri::HTML search_html
|
26
|
+
hits = result.css('.s-title a').map do |h|
|
27
|
+
Hit.new pid: h[:pid], title: h.text, scrapper: self
|
28
|
+
end
|
29
|
+
HitCollection.new hits
|
30
|
+
rescue
|
31
|
+
warn "Cannot access http://www.std.gov.cn/search/stdPage"
|
27
32
|
end
|
28
|
-
HitCollection.new hits
|
29
33
|
end
|
30
34
|
|
31
35
|
# @param pid [String] standard's page id
|
32
36
|
# @return [Gbbib::GbBibliographicItem]
|
33
37
|
def scrape_doc(pid)
|
34
38
|
src = 'http://www.std.gov.cn/gb/search/gbDetailed?id=' + pid
|
35
|
-
|
36
|
-
|
39
|
+
begin
|
40
|
+
doc = Nokogiri::HTML OpenURI.open_uri(src)
|
41
|
+
GbBibliographicItem.new scrapped_data(doc, src: src)
|
42
|
+
rescue
|
43
|
+
warn "Cannot access http://www.std.gov.cn/search/stdPage"
|
44
|
+
end
|
37
45
|
end
|
38
46
|
|
39
47
|
# @param doc [Nokogiri::HTML]
|
@@ -42,7 +50,7 @@ module Gbbib
|
|
42
50
|
# * :name [String]
|
43
51
|
def get_committee(doc)
|
44
52
|
name = doc.xpath('//p/a[1]/following-sibling::text()').text
|
45
|
-
|
53
|
+
.match(/(?<=()[^)]+/).to_s
|
46
54
|
{ type: 'technical', name: name }
|
47
55
|
end
|
48
56
|
end
|
data/lib/gbbib/sec_scrapper.rb
CHANGED
@@ -19,11 +19,15 @@ module Gbbib
|
|
19
19
|
# @return [Gbbib::HitCollection]
|
20
20
|
def scrape_page(text)
|
21
21
|
uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{text}"
|
22
|
-
|
23
|
-
|
24
|
-
|
22
|
+
begin
|
23
|
+
res = JSON.parse Net::HTTP.get(uri)
|
24
|
+
hits = res['rows'].map do |r|
|
25
|
+
Hit.new pid: r['id'], title: r['STD_CODE'], scrapper: self
|
26
|
+
end
|
27
|
+
HitCollection.new hits
|
28
|
+
rescue
|
29
|
+
warn "Cannot access #{uri}"
|
25
30
|
end
|
26
|
-
HitCollection.new hits
|
27
31
|
end
|
28
32
|
|
29
33
|
# @param pid [String] standard's page id
|
@@ -31,8 +35,12 @@ module Gbbib
|
|
31
35
|
def scrape_doc(pid)
|
32
36
|
src = "http://www.std.gov.cn/hb/search/stdHBDetailed?id=#{pid}"
|
33
37
|
page_uri = URI src
|
34
|
-
|
35
|
-
|
38
|
+
begin
|
39
|
+
doc = Nokogiri::HTML Net::HTTP.get(page_uri)
|
40
|
+
GbBibliographicItem.new scrapped_data(doc, src: src)
|
41
|
+
rescue
|
42
|
+
warn "Cannot access #{src}"
|
43
|
+
end
|
36
44
|
end
|
37
45
|
|
38
46
|
private
|
data/lib/gbbib/t_scrapper.rb
CHANGED
@@ -18,18 +18,22 @@ module Gbbib
|
|
18
18
|
# @param text [String]
|
19
19
|
# @return [Gbbib::HitCollection]
|
20
20
|
def scrape_page(text)
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
21
|
+
begin
|
22
|
+
search_html = OpenURI.open_uri(
|
23
|
+
'http://www.ttbz.org.cn/Home/Standard?searchType=2&key=' +
|
24
|
+
CGI.escape(text.tr('-', [8212].pack('U')))
|
25
|
+
)
|
26
|
+
header = Nokogiri::HTML search_html
|
27
|
+
xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
|
28
|
+
t_xpath = '../preceding-sibling::td[3]'
|
29
|
+
hits = header.xpath(xpath).map do |h|
|
30
|
+
title = h.at(t_xpath).text.gsub(/â\u0080\u0094/, '-')
|
31
|
+
Hit.new pid: h[:href].sub(%r{\/$}, ''), title: title, scrapper: self
|
32
|
+
end
|
33
|
+
HitCollection.new hits
|
34
|
+
rescue
|
35
|
+
warn "Cannot connect to #{http://www.ttbz.org.cn/Home/Standard}"
|
31
36
|
end
|
32
|
-
HitCollection.new hits
|
33
37
|
end
|
34
38
|
# rubocop:enable Metrics/MethodLength, Metrics/AbcSize
|
35
39
|
|
@@ -37,8 +41,12 @@ module Gbbib
|
|
37
41
|
# @return [Gbbib::GbBibliographicItem]
|
38
42
|
def scrape_doc(pid)
|
39
43
|
src = "http://www.ttbz.org.cn#{pid}"
|
40
|
-
|
41
|
-
|
44
|
+
begin
|
45
|
+
doc = Nokogiri::HTML OpenURI.open_uri(src), nil, Encoding::UTF_8.to_s
|
46
|
+
GbBibliographicItem.new scrapped_data(doc, src: src)
|
47
|
+
rescue
|
48
|
+
warn "Cannot connect to #{src}"
|
49
|
+
end
|
42
50
|
end
|
43
51
|
|
44
52
|
private
|
@@ -96,7 +104,7 @@ module Gbbib
|
|
96
104
|
|
97
105
|
def get_ccs(doc)
|
98
106
|
[doc.xpath('//td[contains(.,"中国标准分类号")]/following-sibling::td[1]')
|
99
|
-
|
107
|
+
.text.gsub(/[\r\n]/, '').strip.match(/^[^\s]+/).to_s]
|
100
108
|
end
|
101
109
|
|
102
110
|
def get_ics(doc)
|
@@ -108,7 +116,7 @@ module Gbbib
|
|
108
116
|
|
109
117
|
def get_dates(doc)
|
110
118
|
d = doc.xpath('//td[contains(.,"发布日期")]/following-sibling::td[1]/span')
|
111
|
-
|
119
|
+
.text.match(/(?<y>\d{4})[^\d]+(?<m>\d{2})[^\d]+(?<d>\d{2})/)
|
112
120
|
[{ type: 'published', on: "#{d[:y]}-#{d[:m]}-#{d[:d]}" }]
|
113
121
|
end
|
114
122
|
end
|
data/lib/gbbib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gbbib
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.2
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-11-
|
11
|
+
date: 2018-11-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|