gbbib 0.4.2 → 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/gbbib/gb_scrapper.rb +18 -10
- data/lib/gbbib/sec_scrapper.rb +14 -6
- data/lib/gbbib/t_scrapper.rb +23 -15
- data/lib/gbbib/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 58ab839578d5b3328e69d578d2eedd6f5d4ffd50d335640166c036e14ef1529a
|
4
|
+
data.tar.gz: 255fad3a6567026a7e9d84ed2b4cec4adca96338f55f5f2b6c615cd8770055d9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ff92e79b4036741e3b14265b164ef4d425fb4364428ee0c7d44dfcbcc76c8afd41f22613f69bf5f329c486149ed10d72e554f595ab99c0e05bb9d3bc899a9450
|
7
|
+
data.tar.gz: 56a556ca5591226870f02938348c5b256c536cecd062d2f430a466bcdb3b50cefe07384fbb3b08cea771a07e6073a02c73828c45c4077c88d1bbd350d6764eb2
|
data/Gemfile.lock
CHANGED
data/lib/gbbib/gb_scrapper.rb
CHANGED
@@ -18,22 +18,30 @@ module Gbbib
|
|
18
18
|
# @param text [String] code of standard for search
|
19
19
|
# @return [Gbbib::HitCollection]
|
20
20
|
def scrape_page(text)
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
21
|
+
begin
|
22
|
+
search_html = OpenURI.open_uri(
|
23
|
+
'http://www.std.gov.cn/search/stdPage?q=' + text
|
24
|
+
)
|
25
|
+
result = Nokogiri::HTML search_html
|
26
|
+
hits = result.css('.s-title a').map do |h|
|
27
|
+
Hit.new pid: h[:pid], title: h.text, scrapper: self
|
28
|
+
end
|
29
|
+
HitCollection.new hits
|
30
|
+
rescue
|
31
|
+
warn "Cannot access http://www.std.gov.cn/search/stdPage"
|
27
32
|
end
|
28
|
-
HitCollection.new hits
|
29
33
|
end
|
30
34
|
|
31
35
|
# @param pid [String] standard's page id
|
32
36
|
# @return [Gbbib::GbBibliographicItem]
|
33
37
|
def scrape_doc(pid)
|
34
38
|
src = 'http://www.std.gov.cn/gb/search/gbDetailed?id=' + pid
|
35
|
-
|
36
|
-
|
39
|
+
begin
|
40
|
+
doc = Nokogiri::HTML OpenURI.open_uri(src)
|
41
|
+
GbBibliographicItem.new scrapped_data(doc, src: src)
|
42
|
+
rescue
|
43
|
+
warn "Cannot access http://www.std.gov.cn/search/stdPage"
|
44
|
+
end
|
37
45
|
end
|
38
46
|
|
39
47
|
# @param doc [Nokogiri::HTML]
|
@@ -42,7 +50,7 @@ module Gbbib
|
|
42
50
|
# * :name [String]
|
43
51
|
def get_committee(doc)
|
44
52
|
name = doc.xpath('//p/a[1]/following-sibling::text()').text
|
45
|
-
|
53
|
+
.match(/(?<=（)[^）]+/).to_s
|
46
54
|
{ type: 'technical', name: name }
|
47
55
|
end
|
48
56
|
end
|
data/lib/gbbib/sec_scrapper.rb
CHANGED
@@ -19,11 +19,15 @@ module Gbbib
|
|
19
19
|
# @return [Gbbib::HitCollection]
|
20
20
|
def scrape_page(text)
|
21
21
|
uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{text}"
|
22
|
-
|
23
|
-
|
24
|
-
|
22
|
+
begin
|
23
|
+
res = JSON.parse Net::HTTP.get(uri)
|
24
|
+
hits = res['rows'].map do |r|
|
25
|
+
Hit.new pid: r['id'], title: r['STD_CODE'], scrapper: self
|
26
|
+
end
|
27
|
+
HitCollection.new hits
|
28
|
+
rescue
|
29
|
+
warn "Cannot access #{uri}"
|
25
30
|
end
|
26
|
-
HitCollection.new hits
|
27
31
|
end
|
28
32
|
|
29
33
|
# @param pid [String] standard's page id
|
@@ -31,8 +35,12 @@ module Gbbib
|
|
31
35
|
def scrape_doc(pid)
|
32
36
|
src = "http://www.std.gov.cn/hb/search/stdHBDetailed?id=#{pid}"
|
33
37
|
page_uri = URI src
|
34
|
-
|
35
|
-
|
38
|
+
begin
|
39
|
+
doc = Nokogiri::HTML Net::HTTP.get(page_uri)
|
40
|
+
GbBibliographicItem.new scrapped_data(doc, src: src)
|
41
|
+
rescue
|
42
|
+
warn "Cannot access #{src}"
|
43
|
+
end
|
36
44
|
end
|
37
45
|
|
38
46
|
private
|
data/lib/gbbib/t_scrapper.rb
CHANGED
@@ -18,18 +18,22 @@ module Gbbib
|
|
18
18
|
# @param text [String]
|
19
19
|
# @return [Gbbib::HitCollection]
|
20
20
|
def scrape_page(text)
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
21
|
+
begin
|
22
|
+
search_html = OpenURI.open_uri(
|
23
|
+
'http://www.ttbz.org.cn/Home/Standard?searchType=2&key=' +
|
24
|
+
CGI.escape(text.tr('-', [8212].pack('U')))
|
25
|
+
)
|
26
|
+
header = Nokogiri::HTML search_html
|
27
|
+
xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
|
28
|
+
t_xpath = '../preceding-sibling::td[3]'
|
29
|
+
hits = header.xpath(xpath).map do |h|
|
30
|
+
title = h.at(t_xpath).text.gsub(/â\u0080\u0094/, '-')
|
31
|
+
Hit.new pid: h[:href].sub(%r{\/$}, ''), title: title, scrapper: self
|
32
|
+
end
|
33
|
+
HitCollection.new hits
|
34
|
+
rescue
|
35
|
+
warn "Cannot connect to #{http://www.ttbz.org.cn/Home/Standard}"
|
31
36
|
end
|
32
|
-
HitCollection.new hits
|
33
37
|
end
|
34
38
|
# rubocop:enable Metrics/MethodLength, Metrics/AbcSize
|
35
39
|
|
@@ -37,8 +41,12 @@ module Gbbib
|
|
37
41
|
# @return [Gbbib::GbBibliographicItem]
|
38
42
|
def scrape_doc(pid)
|
39
43
|
src = "http://www.ttbz.org.cn#{pid}"
|
40
|
-
|
41
|
-
|
44
|
+
begin
|
45
|
+
doc = Nokogiri::HTML OpenURI.open_uri(src), nil, Encoding::UTF_8.to_s
|
46
|
+
GbBibliographicItem.new scrapped_data(doc, src: src)
|
47
|
+
rescue
|
48
|
+
warn "Cannot connect to #{src}"
|
49
|
+
end
|
42
50
|
end
|
43
51
|
|
44
52
|
private
|
@@ -96,7 +104,7 @@ module Gbbib
|
|
96
104
|
|
97
105
|
def get_ccs(doc)
|
98
106
|
[doc.xpath('//td[contains(.,"中国标准分类号")]/following-sibling::td[1]')
|
99
|
-
|
107
|
+
.text.gsub(/[\r\n]/, '').strip.match(/^[^\s]+/).to_s]
|
100
108
|
end
|
101
109
|
|
102
110
|
def get_ics(doc)
|
@@ -108,7 +116,7 @@ module Gbbib
|
|
108
116
|
|
109
117
|
def get_dates(doc)
|
110
118
|
d = doc.xpath('//td[contains(.,"发布日期")]/following-sibling::td[1]/span')
|
111
|
-
|
119
|
+
.text.match(/(?<y>\d{4})[^\d]+(?<m>\d{2})[^\d]+(?<d>\d{2})/)
|
112
120
|
[{ type: 'published', on: "#{d[:y]}-#{d[:m]}-#{d[:d]}" }]
|
113
121
|
end
|
114
122
|
end
|
data/lib/gbbib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gbbib
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.2
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-11-
|
11
|
+
date: 2018-11-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|