gbbib 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6eac6d0000a21b3dc52113ebda470cb6be672d2088e033648e090765559f0684
4
- data.tar.gz: 7fb3b022408232970d3812713d14de9f32eb8ed5c3de9cec00533effd188e5c0
3
+ metadata.gz: 58ab839578d5b3328e69d578d2eedd6f5d4ffd50d335640166c036e14ef1529a
4
+ data.tar.gz: 255fad3a6567026a7e9d84ed2b4cec4adca96338f55f5f2b6c615cd8770055d9
5
5
  SHA512:
6
- metadata.gz: e640701864d9f49b65a8b2842a3e6a29d0a6f4178f6598266cc95f690f9ac022fb4329a8e727e1c86d66369a64ab1e881523e51b30b2d1555f1af25a2bb88dd1
7
- data.tar.gz: 13ec0e6d6e4c15a3a3475fccf541bb3ab37027bfa1f712e544fb55d58d96e4285eb6d9fc6a6e8f972352063a3ee30b3ea6e1bfe4fa7fc5f771d647874d801b20
6
+ metadata.gz: ff92e79b4036741e3b14265b164ef4d425fb4364428ee0c7d44dfcbcc76c8afd41f22613f69bf5f329c486149ed10d72e554f595ab99c0e05bb9d3bc899a9450
7
+ data.tar.gz: 56a556ca5591226870f02938348c5b256c536cecd062d2f430a466bcdb3b50cefe07384fbb3b08cea771a07e6073a02c73828c45c4077c88d1bbd350d6764eb2
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- gbbib (0.4.2)
4
+ gbbib (0.4.3)
5
5
  cnccs (~> 0.1.1)
6
6
  gb-agencies (~> 0.0.1)
7
7
  iso-bib-item (~> 0.4.2)
@@ -18,22 +18,30 @@ module Gbbib
18
18
  # @param text [String] code of standard for search
19
19
  # @return [Gbbib::HitCollection]
20
20
  def scrape_page(text)
21
- search_html = OpenURI.open_uri(
22
- 'http://www.std.gov.cn/search/stdPage?q=' + text
23
- )
24
- result = Nokogiri::HTML search_html
25
- hits = result.css('.s-title a').map do |h|
26
- Hit.new pid: h[:pid], title: h.text, scrapper: self
21
+ begin
22
+ search_html = OpenURI.open_uri(
23
+ 'http://www.std.gov.cn/search/stdPage?q=' + text
24
+ )
25
+ result = Nokogiri::HTML search_html
26
+ hits = result.css('.s-title a').map do |h|
27
+ Hit.new pid: h[:pid], title: h.text, scrapper: self
28
+ end
29
+ HitCollection.new hits
30
+ rescue
31
+ warn "Cannot access http://www.std.gov.cn/search/stdPage"
27
32
  end
28
- HitCollection.new hits
29
33
  end
30
34
 
31
35
  # @param pid [String] standard's page id
32
36
  # @return [Gbbib::GbBibliographicItem]
33
37
  def scrape_doc(pid)
34
38
  src = 'http://www.std.gov.cn/gb/search/gbDetailed?id=' + pid
35
- doc = Nokogiri::HTML OpenURI.open_uri(src)
36
- GbBibliographicItem.new scrapped_data(doc, src: src)
39
+ begin
40
+ doc = Nokogiri::HTML OpenURI.open_uri(src)
41
+ GbBibliographicItem.new scrapped_data(doc, src: src)
42
+ rescue
43
+ warn "Cannot access http://www.std.gov.cn/search/stdPage"
44
+ end
37
45
  end
38
46
 
39
47
  # @param doc [Nokogiri::HTML]
@@ -42,7 +50,7 @@ module Gbbib
42
50
  # * :name [String]
43
51
  def get_committee(doc)
44
52
  name = doc.xpath('//p/a[1]/following-sibling::text()').text
45
- .match(/(?<=()[^)]+/).to_s
53
+ .match(/(?<=()[^)]+/).to_s
46
54
  { type: 'technical', name: name }
47
55
  end
48
56
  end
@@ -19,11 +19,15 @@ module Gbbib
19
19
  # @return [Gbbib::HitCollection]
20
20
  def scrape_page(text)
21
21
  uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{text}"
22
- res = JSON.parse Net::HTTP.get(uri)
23
- hits = res['rows'].map do |r|
24
- Hit.new pid: r['id'], title: r['STD_CODE'], scrapper: self
22
+ begin
23
+ res = JSON.parse Net::HTTP.get(uri)
24
+ hits = res['rows'].map do |r|
25
+ Hit.new pid: r['id'], title: r['STD_CODE'], scrapper: self
26
+ end
27
+ HitCollection.new hits
28
+ rescue
29
+ warn "Cannot access #{uri}"
25
30
  end
26
- HitCollection.new hits
27
31
  end
28
32
 
29
33
  # @param pid [String] standard's page id
@@ -31,8 +35,12 @@ module Gbbib
31
35
  def scrape_doc(pid)
32
36
  src = "http://www.std.gov.cn/hb/search/stdHBDetailed?id=#{pid}"
33
37
  page_uri = URI src
34
- doc = Nokogiri::HTML Net::HTTP.get(page_uri)
35
- GbBibliographicItem.new scrapped_data(doc, src: src)
38
+ begin
39
+ doc = Nokogiri::HTML Net::HTTP.get(page_uri)
40
+ GbBibliographicItem.new scrapped_data(doc, src: src)
41
+ rescue
42
+ warn "Cannot access #{src}"
43
+ end
36
44
  end
37
45
 
38
46
  private
@@ -18,18 +18,22 @@ module Gbbib
18
18
  # @param text [String]
19
19
  # @return [Gbbib::HitCollection]
20
20
  def scrape_page(text)
21
- search_html = OpenURI.open_uri(
22
- 'http://www.ttbz.org.cn/Home/Standard?searchType=2&key=' +
23
- CGI.escape(text.tr('-', [8212].pack('U')))
24
- )
25
- header = Nokogiri::HTML search_html
26
- xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
27
- t_xpath = '../preceding-sibling::td[3]'
28
- hits = header.xpath(xpath).map do |h|
29
- title = h.at(t_xpath).text.gsub(/â\u0080\u0094/, '-')
30
- Hit.new pid: h[:href].sub(%r{\/$}, ''), title: title, scrapper: self
21
+ begin
22
+ search_html = OpenURI.open_uri(
23
+ 'http://www.ttbz.org.cn/Home/Standard?searchType=2&key=' +
24
+ CGI.escape(text.tr('-', [8212].pack('U')))
25
+ )
26
+ header = Nokogiri::HTML search_html
27
+ xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
28
+ t_xpath = '../preceding-sibling::td[3]'
29
+ hits = header.xpath(xpath).map do |h|
30
+ title = h.at(t_xpath).text.gsub(/â\u0080\u0094/, '-')
31
+ Hit.new pid: h[:href].sub(%r{\/$}, ''), title: title, scrapper: self
32
+ end
33
+ HitCollection.new hits
34
+ rescue
35
+ warn "Cannot connect to #{http://www.ttbz.org.cn/Home/Standard}"
31
36
  end
32
- HitCollection.new hits
33
37
  end
34
38
  # rubocop:enable Metrics/MethodLength, Metrics/AbcSize
35
39
 
@@ -37,8 +41,12 @@ module Gbbib
37
41
  # @return [Gbbib::GbBibliographicItem]
38
42
  def scrape_doc(pid)
39
43
  src = "http://www.ttbz.org.cn#{pid}"
40
- doc = Nokogiri::HTML OpenURI.open_uri(src), nil, Encoding::UTF_8.to_s
41
- GbBibliographicItem.new scrapped_data(doc, src: src)
44
+ begin
45
+ doc = Nokogiri::HTML OpenURI.open_uri(src), nil, Encoding::UTF_8.to_s
46
+ GbBibliographicItem.new scrapped_data(doc, src: src)
47
+ rescue
48
+ warn "Cannot connect to #{src}"
49
+ end
42
50
  end
43
51
 
44
52
  private
@@ -96,7 +104,7 @@ module Gbbib
96
104
 
97
105
  def get_ccs(doc)
98
106
  [doc.xpath('//td[contains(.,"中国标准分类号")]/following-sibling::td[1]')
99
- .text.gsub(/[\r\n]/, '').strip.match(/^[^\s]+/).to_s]
107
+ .text.gsub(/[\r\n]/, '').strip.match(/^[^\s]+/).to_s]
100
108
  end
101
109
 
102
110
  def get_ics(doc)
@@ -108,7 +116,7 @@ module Gbbib
108
116
 
109
117
  def get_dates(doc)
110
118
  d = doc.xpath('//td[contains(.,"发布日期")]/following-sibling::td[1]/span')
111
- .text.match(/(?<y>\d{4})[^\d]+(?<m>\d{2})[^\d]+(?<d>\d{2})/)
119
+ .text.match(/(?<y>\d{4})[^\d]+(?<m>\d{2})[^\d]+(?<d>\d{2})/)
112
120
  [{ type: 'published', on: "#{d[:y]}-#{d[:m]}-#{d[:d]}" }]
113
121
  end
114
122
  end
data/lib/gbbib/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Gbbib
4
- VERSION = '0.4.2'
4
+ VERSION = '0.4.3'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gbbib
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-11-27 00:00:00.000000000 Z
11
+ date: 2018-11-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler