gbbib 0.4.2 → 0.4.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 6eac6d0000a21b3dc52113ebda470cb6be672d2088e033648e090765559f0684
- data.tar.gz: 7fb3b022408232970d3812713d14de9f32eb8ed5c3de9cec00533effd188e5c0
+ metadata.gz: 58ab839578d5b3328e69d578d2eedd6f5d4ffd50d335640166c036e14ef1529a
+ data.tar.gz: 255fad3a6567026a7e9d84ed2b4cec4adca96338f55f5f2b6c615cd8770055d9
  SHA512:
- metadata.gz: e640701864d9f49b65a8b2842a3e6a29d0a6f4178f6598266cc95f690f9ac022fb4329a8e727e1c86d66369a64ab1e881523e51b30b2d1555f1af25a2bb88dd1
- data.tar.gz: 13ec0e6d6e4c15a3a3475fccf541bb3ab37027bfa1f712e544fb55d58d96e4285eb6d9fc6a6e8f972352063a3ee30b3ea6e1bfe4fa7fc5f771d647874d801b20
+ metadata.gz: ff92e79b4036741e3b14265b164ef4d425fb4364428ee0c7d44dfcbcc76c8afd41f22613f69bf5f329c486149ed10d72e554f595ab99c0e05bb9d3bc899a9450
+ data.tar.gz: 56a556ca5591226870f02938348c5b256c536cecd062d2f430a466bcdb3b50cefe07384fbb3b08cea771a07e6073a02c73828c45c4077c88d1bbd350d6764eb2
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
  PATH
  remote: .
  specs:
- gbbib (0.4.2)
+ gbbib (0.4.3)
  cnccs (~> 0.1.1)
  gb-agencies (~> 0.0.1)
  iso-bib-item (~> 0.4.2)
@@ -18,22 +18,30 @@ module Gbbib
  # @param text [Strin] code of standard for serarch
  # @return [Gbbib::HitCollection]
  def scrape_page(text)
- search_html = OpenURI.open_uri(
- 'http://www.std.gov.cn/search/stdPage?q=' + text
- )
- result = Nokogiri::HTML search_html
- hits = result.css('.s-title a').map do |h|
- Hit.new pid: h[:pid], title: h.text, scrapper: self
+ begin
+ search_html = OpenURI.open_uri(
+ 'http://www.std.gov.cn/search/stdPage?q=' + text
+ )
+ result = Nokogiri::HTML search_html
+ hits = result.css('.s-title a').map do |h|
+ Hit.new pid: h[:pid], title: h.text, scrapper: self
+ end
+ HitCollection.new hits
+ rescue
+ warn "Cannot access http://www.std.gov.cn/search/stdPage"
  end
- HitCollection.new hits
  end

  # @param pid [Strin] standard's page id
  # @return [Gbbib::GbBibliographicItem]
  def scrape_doc(pid)
  src = 'http://www.std.gov.cn/gb/search/gbDetailed?id=' + pid
- doc = Nokogiri::HTML OpenURI.open_uri(src)
- GbBibliographicItem.new scrapped_data(doc, src: src)
+ begin
+ doc = Nokogiri::HTML OpenURI.open_uri(src)
+ GbBibliographicItem.new scrapped_data(doc, src: src)
+ rescue
+ warn "Cannot access http://www.std.gov.cn/search/stdPage"
+ end
  end

  # @param doc [Nokogiri::HTML]
@@ -42,7 +50,7 @@ module Gbbib
  # * :name [String]
  def get_committee(doc)
  name = doc.xpath('//p/a[1]/following-sibling::text()').text
- .match(/(?<=（)[^）]+/).to_s
+ .match(/(?<=（)[^）]+/).to_s
  { type: 'technical', name: name }
  end
  end
@@ -19,11 +19,15 @@ module Gbbib
  # @return [Gbbib::HitCollection]
  def scrape_page(text)
  uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{text}"
- res = JSON.parse Net::HTTP.get(uri)
- hits = res['rows'].map do |r|
- Hit.new pid: r['id'], title: r['STD_CODE'], scrapper: self
+ begin
+ res = JSON.parse Net::HTTP.get(uri)
+ hits = res['rows'].map do |r|
+ Hit.new pid: r['id'], title: r['STD_CODE'], scrapper: self
+ end
+ HitCollection.new hits
+ rescue
+ warn "Cannot access #{uri}"
  end
- HitCollection.new hits
  end

  # @param pid [String] standard's page id
@@ -31,8 +35,12 @@ module Gbbib
  def scrape_doc(pid)
  src = "http://www.std.gov.cn/hb/search/stdHBDetailed?id=#{pid}"
  page_uri = URI src
- doc = Nokogiri::HTML Net::HTTP.get(page_uri)
- GbBibliographicItem.new scrapped_data(doc, src: src)
+ begin
+ doc = Nokogiri::HTML Net::HTTP.get(page_uri)
+ GbBibliographicItem.new scrapped_data(doc, src: src)
+ rescue
+ warn "Cannot access #{src}"
+ end
  end

  private
@@ -18,18 +18,22 @@ module Gbbib
  # @param text [String]
  # @return [Gbbib::HitCollection]
  def scrape_page(text)
- search_html = OpenURI.open_uri(
- 'http://www.ttbz.org.cn/Home/Standard?searchType=2&key=' +
- CGI.escape(text.tr('-', [8212].pack('U')))
- )
- header = Nokogiri::HTML search_html
- xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
- t_xpath = '../preceding-sibling::td[3]'
- hits = header.xpath(xpath).map do |h|
- title = h.at(t_xpath).text.gsub(/â\u0080\u0094/, '-')
- Hit.new pid: h[:href].sub(%r{\/$}, ''), title: title, scrapper: self
+ begin
+ search_html = OpenURI.open_uri(
+ 'http://www.ttbz.org.cn/Home/Standard?searchType=2&key=' +
+ CGI.escape(text.tr('-', [8212].pack('U')))
+ )
+ header = Nokogiri::HTML search_html
+ xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
+ t_xpath = '../preceding-sibling::td[3]'
+ hits = header.xpath(xpath).map do |h|
+ title = h.at(t_xpath).text.gsub(/â\u0080\u0094/, '-')
+ Hit.new pid: h[:href].sub(%r{\/$}, ''), title: title, scrapper: self
+ end
+ HitCollection.new hits
+ rescue
+ warn "Cannot connect to http://www.ttbz.org.cn/Home/Standard"
  end
- HitCollection.new hits
  end
  # rubocop:enable Metrics/MethodLength, Metrics/AbcSize

@@ -37,8 +41,12 @@ module Gbbib
  # @return [Gbbib::GbBibliographicItem]
  def scrape_doc(pid)
  src = "http://www.ttbz.org.cn#{pid}"
- doc = Nokogiri::HTML OpenURI.open_uri(src), nil, Encoding::UTF_8.to_s
- GbBibliographicItem.new scrapped_data(doc, src: src)
+ begin
+ doc = Nokogiri::HTML OpenURI.open_uri(src), nil, Encoding::UTF_8.to_s
+ GbBibliographicItem.new scrapped_data(doc, src: src)
+ rescue
+ warn "Cannot connect to #{src}"
+ end
  end

  private
@@ -96,7 +104,7 @@ module Gbbib

  def get_ccs(doc)
  [doc.xpath('//td[contains(.,"中国标准分类号")]/following-sibling::td[1]')
- .text.gsub(/[\r\n]/, '').strip.match(/^[^\s]+/).to_s]
+ .text.gsub(/[\r\n]/, '').strip.match(/^[^\s]+/).to_s]
  end

  def get_ics(doc)
@@ -108,7 +116,7 @@ module Gbbib

  def get_dates(doc)
  d = doc.xpath('//td[contains(.,"发布日期")]/following-sibling::td[1]/span')
- .text.match(/(?<y>\d{4})[^\d]+(?<m>\d{2})[^\d]+(?<d>\d{2})/)
+ .text.match(/(?<y>\d{4})[^\d]+(?<m>\d{2})[^\d]+(?<d>\d{2})/)
  [{ type: 'published', on: "#{d[:y]}-#{d[:m]}-#{d[:d]}" }]
  end
  end
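
The scraper changes above all follow one pattern: the network fetch and parse are wrapped in begin/rescue, so an unreachable host now produces a warning (and a nil return from scrape_page/scrape_doc) instead of an uncaught exception. Below is a minimal standalone sketch of that pattern, using the same OpenURI and Nokogiri calls as the diff; the fetch_hits method name and SEARCH_URL constant are illustrative only and not part of the gem.

require 'open-uri'
require 'nokogiri'

# Illustrative constant; the gem builds this URL inline.
SEARCH_URL = 'http://www.std.gov.cn/search/stdPage?q='

# Fetch and parse a search page. Any network or parse error is caught,
# a warning is printed, and the method returns nil, the same graceful
# degradation the gem's scrapers now provide.
def fetch_hits(text)
  html = OpenURI.open_uri(SEARCH_URL + text)
  doc = Nokogiri::HTML(html)
  doc.css('.s-title a').map { |a| { pid: a[:pid], title: a.text } }
rescue StandardError
  warn "Cannot access #{SEARCH_URL}"
  nil
end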
data/lib/gbbib/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module Gbbib
- VERSION = '0.4.2'
+ VERSION = '0.4.3'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: gbbib
  version: !ruby/object:Gem::Version
- version: 0.4.2
+ version: 0.4.3
  platform: ruby
  authors:
  - Ribose Inc.
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2018-11-27 00:00:00.000000000 Z
+ date: 2018-11-29 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler