relaton-gb 0.6.2 → 0.6.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c0978ce217698027cca98e48fcad3fdf3060af8f
4
- data.tar.gz: 3f9e4857d33b1b3aee204746098c003816ded415
3
+ metadata.gz: 64d418169f8f5f6c500a43e913ea94d13d4dca7d
4
+ data.tar.gz: 2717d29fdc48172660535462cade3eb856dcf362
5
5
  SHA512:
6
- metadata.gz: 6a52a32b85031f0b31005a562fa3c95e630f9a33e353b7b0371eee23ebb8483430cf770bc16f900406357609ecbabae3dcb8dab1af6e2ac5a9151866bb03ceb0
7
- data.tar.gz: 9d3ab095b5681268b8113789b53c0b60140e4faf427918f29561460454fec99b9cff46bb9d39d5b6480df341f7d23396c17bcb8258fa95f413965a7f136f4e96
6
+ metadata.gz: e372dd88445fdb97b2bda4c41b755bb8b325ade7c271f0db0e567c697c7b4c80b6d1da24c7af1e8439fee87e9e9cf1b2ff2940e1dd86d5146c1efa73ec048e4b
7
+ data.tar.gz: ee2e5d7edc8f836505a024bff24e543fa70a05d9508885a3ead36b798a0750aa661a825f3f33250b00740a78b5edfb02d5610d616902aa7899d3ea75046e7594
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- relaton-gb (0.6.2)
4
+ relaton-gb (0.6.3)
5
5
  cnccs (~> 0.1.1)
6
6
  gb-agencies (~> 0.0.1)
7
7
  relaton-iso-bib (~> 0.3.0)
@@ -9,11 +9,13 @@ PATH
9
9
  GEM
10
10
  remote: https://rubygems.org/
11
11
  specs:
12
- addressable (2.6.0)
13
- public_suffix (>= 2.0.2, < 4.0)
12
+ addressable (2.7.0)
13
+ public_suffix (>= 2.0.2, < 5.0)
14
14
  byebug (11.0.1)
15
15
  cnccs (0.1.3)
16
16
  coderay (1.1.2)
17
+ crack (0.4.3)
18
+ safe_yaml (~> 1.0.0)
17
19
  debase (0.2.2)
18
20
  debase-ruby_core_source (>= 0.10.2)
19
21
  debase-ruby_core_source (0.10.5)
@@ -22,6 +24,7 @@ GEM
22
24
  equivalent-xml (0.6.0)
23
25
  nokogiri (>= 1.4.3)
24
26
  gb-agencies (0.0.5)
27
+ hashdiff (1.0.0)
25
28
  isoics (0.1.7)
26
29
  json (2.2.0)
27
30
  method_source (0.9.2)
@@ -34,12 +37,12 @@ GEM
34
37
  pry-byebug (3.7.0)
35
38
  byebug (~> 11.0)
36
39
  pry (~> 0.10)
37
- public_suffix (3.1.1)
40
+ public_suffix (4.0.1)
38
41
  rake (10.5.0)
39
- relaton-bib (0.3.5)
42
+ relaton-bib (0.3.6)
40
43
  addressable
41
- nokogiri (~> 1.10)
42
- relaton-iso-bib (0.3.4)
44
+ nokogiri
45
+ relaton-iso-bib (0.3.5)
43
46
  isoics (~> 0.1.6)
44
47
  relaton-bib (~> 0.3.0)
45
48
  ruby_deep_clone (~> 0.8.0)
@@ -59,11 +62,17 @@ GEM
59
62
  ruby-debug-ide (0.7.0)
60
63
  rake (>= 0.8.1)
61
64
  ruby_deep_clone (0.8.0)
65
+ safe_yaml (1.0.5)
62
66
  simplecov (0.16.1)
63
67
  docile (~> 1.1)
64
68
  json (>= 1.8, < 3)
65
69
  simplecov-html (~> 0.10.0)
66
70
  simplecov-html (0.10.2)
71
+ vcr (5.0.0)
72
+ webmock (3.7.0)
73
+ addressable (>= 2.3.6)
74
+ crack (>= 0.3.2)
75
+ hashdiff (>= 0.4.0, < 2.0.0)
67
76
 
68
77
  PLATFORMS
69
78
  ruby
@@ -78,6 +87,8 @@ DEPENDENCIES
78
87
  rspec (~> 3.0)
79
88
  ruby-debug-ide
80
89
  simplecov
90
+ vcr
91
+ webmock
81
92
 
82
93
  BUNDLED WITH
83
94
  2.0.1
@@ -91,16 +91,11 @@ module RelatonGb
91
91
  def search_filter(code)
92
92
  # search filter needs to incorporate year
93
93
  docidrx = %r{^[^\s]+\s[\d\.-]+}
94
- # corrigrx = %r{^[^\s]+\s[\d\.]+-[0-9]+/}
95
94
  warn "fetching #{code}..."
96
95
  result = search(code)
97
- ret = result.select do |hit|
98
- hit.title && hit.title.match(docidrx).to_s == code # &&
99
- # !corrigrx =~ hit.title
96
+ result.select do |hit|
97
+ hit.docref && hit.docref.match(docidrx).to_s.include?(code)
100
98
  end
101
- return ret unless ret.empty?
102
-
103
- []
104
99
  end
105
100
 
106
101
  # Sort through the results from Isobib, fetching them three at a time,
@@ -125,12 +120,15 @@ module RelatonGb
125
120
  { years: missed_years }
126
121
  end
127
122
 
128
- def fetch_pages(s, n)
129
- workers = RelatonBib::WorkersPool.new n
123
+ # @param hits [RelatonBib::HitCollection<RelatonBib::Hit>]
124
+ # @param threads [Integer]
125
+ # @return [Array<RelatonGb::GbBibliographicItem>]
126
+ def fetch_pages(hits, threads)
127
+ workers = RelatonBib::WorkersPool.new threads
130
128
  workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
131
- s.each_with_index { |hit, i| workers << { i: i, hit: hit } }
129
+ hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
132
130
  workers.end
133
- workers.result.sort { |x, y| x[:i] <=> y[:i] }.map { |x| x[:hit] }
131
+ workers.result.sort_by { |x| x[:i] }.map { |x| x[:hit] }
134
132
  end
135
133
  end
136
134
  end
@@ -16,35 +16,40 @@ module RelatonGb
16
16
  # @return [RelatonGb::HitCollection]
17
17
  def scrape_page(text)
18
18
  search_html = OpenURI.open_uri(
19
- "http://www.std.gov.cn/search/stdPage?q=" + text
19
+ "http://openstd.samr.gov.cn/bzgk/gb/std_list?p.p2=" + text
20
20
  )
21
21
  result = Nokogiri::HTML search_html
22
- hits = result.css(".s-title a").map do |h|
23
- Hit.new pid: h[:pid], title: h.text, scrapper: self
22
+ hits = result.xpath(
23
+ "//table[contains(@class, 'result_list')]/tbody[2]/tr",
24
+ ).map do |h|
25
+ ref = h.at "./td[2]/a"
26
+ pid = ref[:onclick].match(/[0-9A-F]+/).to_s
27
+ rdate = h.at("./td[7]").text
28
+ Hit.new pid: pid, docref: ref.text, scrapper: self, release_date: rdate
24
29
  end
25
- HitCollection.new hits
30
+ HitCollection.new hits.sort_by(&:release_date).reverse
26
31
  rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
27
- raise RelatonBib::RequestError, "Cannot access http://www.std.gov.cn/search/stdPage"
32
+ raise RelatonBib::RequestError, "Cannot access http://www.std.gov.cn/bzgk/gb/std_list"
28
33
  end
29
34
 
30
- # @param pid [Strin] standard's page id
35
+ # @param hit [RelatonGb::Hit] search hit carrying the standard's page id
31
36
  # @return [RelatonGb::GbBibliographicItem]
32
- def scrape_doc(pid)
33
- src = "http://www.std.gov.cn/gb/search/gbDetailed?id=" + pid
37
+ def scrape_doc(hit)
38
+ src = "http://openstd.samr.gov.cn/bzgk/gb/newGbInfo?hcno=" + hit.pid
34
39
  doc = Nokogiri::HTML OpenURI.open_uri(src)
35
- GbBibliographicItem.new scrapped_data(doc, src: src)
40
+ GbBibliographicItem.new scrapped_data(doc, src, hit)
36
41
  rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
37
42
  raise RelatonBib::RequestError, "Cannot access #{src}"
38
43
  end
39
44
 
40
45
  # @param doc [Nokogiri::HTML]
46
+ # @param _ref [String]
41
47
  # @return [Hash]
42
48
  # * :type [String]
43
49
  # * :name [String]
44
- def get_committee(doc)
45
- name = doc.xpath("//p/a[1]/following-sibling::text()").text.
46
- match(/(?<=()[^)]+/).to_s
47
- { type: "technical", name: name }
50
+ def get_committee(doc, _ref)
51
+ name = doc.at("//div[contains(text(), '归口单位')]/following-sibling::div")
52
+ { type: "technical", name: name.text.delete("\r\n\t\t") }
48
53
  end
49
54
  end
50
55
  end
@@ -7,28 +7,36 @@ module RelatonGb
7
7
  attr_reader :hit_collection
8
8
 
9
9
  # @return [String]
10
- attr_reader :pid
10
+ attr_reader :pid, :docref
11
11
 
12
- # @return [String]
13
- attr_reader :title
12
+ # @return [Date, NilClass]
13
+ attr_reader :release_date
14
+
15
+ # @return [String, NilClass]
16
+ attr_reader :status
14
17
 
15
18
  # @return [RelatonGb::GbScrapper, RelatonGb::SecScraper, RelatonGb::TScrapper]
16
19
  attr_reader :scrapper
17
20
 
18
- # @param hit [Hash]
19
- # @param hit_collection [Isobib:HitCollection]
20
- def initialize(pid:, title:, hit_collection: nil, scrapper:)
21
+ # @param pid [String]
22
+ # @param docref [String]
23
+ # @param scrapper [RelatonGb::GbScrapper, RelatonGb::SecScraper, RelatonGb::TScrapper]
24
+ # @param release_date [String]
25
+ # @param status [String, NilClass]
26
+ # @param hit_collection [RelatonGb::HitCollection, NilClass]
27
+ def initialize(pid:, docref:, scrapper:, **args)
21
28
  @pid = pid
22
- @title = title
23
- @hit_collection = hit_collection
29
+ @docref = docref
24
30
  @scrapper = scrapper
25
- self.hit_collection << self if hit_collection
31
+ @release_date = Date.parse args[:release_date] if args[:release_date]
32
+ @status = args[:status]
33
+ @hit_collection = args[:hit_collection]
26
34
  end
27
35
 
28
36
  # Parse page.
29
37
  # @return [RelatonGb::GbBibliographicItem]
30
38
  def fetch
31
- @fetch ||= scrapper.scrape_doc pid
39
+ @fetch ||= scrapper.scrape_doc self
32
40
  end
33
41
 
34
42
  # @return [String]
@@ -40,21 +48,7 @@ module RelatonGb
40
48
  def inspect
41
49
  "<#{self.class}:#{format('%#.14x', object_id << 1)} "\
42
50
  "@fullIdentifier=\"#{@fetch&.shortref}\" "\
43
- "@title=\"#{title}\">"
51
+ "@docref=\"#{docref}\">"
44
52
  end
45
-
46
- # @param builder [Nokogiri::XML::Builder]
47
- # @param opts [Hash]
48
- # @return [String]
49
- # def to_xml(builder = nil, opts = {})
50
- # if builder
51
- # fetch.to_xml builder, opts
52
- # else
53
- # builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
54
- # fetch.to_xml xml, opts
55
- # end
56
- # builder.doc.root.to_xml
57
- # end
58
- # end
59
53
  end
60
54
  end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "relaton/processor"
4
+
5
+ module RelatonGb
6
+ class Processor < Relaton::Processor
7
+ def initialize
8
+ @short = :relaton_gb
9
+ @prefix = "CN"
10
+ @defaultprefix = %r{^GB }
11
+ @idtype = "Chinese Standard"
12
+ end
13
+
14
+ # @param code [String]
15
+ # @param date [String, NilClass] year
16
+ # @param opts [Hash]
17
+ # @return [RelatonGb::GbBibliographicItem]
18
+ def get(code, date, opts)
19
+ ::RelatonGb::GbBibliography.get(code, date, opts)
20
+ end
21
+
22
+ # @param xml [String]
23
+ # @return [RelatonGb::GbBibliographicItem]
24
+ def from_xml(xml)
25
+ ::RelatonGb::XMLParser.from_xml xml
26
+ end
27
+
28
+ # @param hash [Hash]
29
+ # @return [RelatonGb::GbBibliographicItem]
30
+ def hash_to_bib(hash)
31
+ item_hash = ::RelatonGb::HashConverter.hash_to_bib(hash)
32
+ ::RelatonGb::GbBibliographicItem.new item_hash
33
+ end
34
+ end
35
+ end
@@ -11,64 +11,54 @@ module RelatonGb
11
11
 
12
12
  # rubocop:disable Metrics/MethodLength
13
13
  # @param doc [Nokogiri::HTML::Document]
14
- # @param src [String] url of scrapped page
14
+ # @param src [String]
15
+ # @param hit [RelatonGb::Hit]
15
16
  # @return [Hash]
16
- def scrapped_data(doc, src:)
17
+ def scrapped_data(doc, src, hit)
17
18
  {
18
- committee: get_committee(doc),
19
- docid: get_docid(doc),
19
+ committee: get_committee(doc, hit.docref),
20
+ docid: get_docid(hit.docref),
20
21
  title: get_titles(doc),
21
- contributor: get_contributors(doc),
22
- type: get_type(doc),
23
- docstatus: get_status(doc),
24
- gbtype: get_gbtype(doc),
22
+ contributor: get_contributors(doc, hit.docref),
23
+ type: get_type,
24
+ docstatus: get_status(doc, hit.status),
25
+ gbtype: get_gbtype(doc, hit.docref),
25
26
  ccs: get_ccs(doc),
26
27
  ics: get_ics(doc),
27
28
  link: [{ type: "src", content: src }],
28
29
  date: get_dates(doc),
29
30
  language: ["zh"],
30
31
  script: ["Hans"],
31
- structuredidentifier: fetch_structuredidentifier(doc),
32
+ structuredidentifier: fetch_structuredidentifier(hit.docref),
32
33
  }
33
34
  end
34
35
  # rubocop:enable Metrics/MethodLength
35
36
 
36
- # @param doc [Nokogiri::HTML::Document]
37
- # @param xpt [String]
37
+ # @param docref [String]
38
38
  # @return [Array<RelatonBib::DocumentIdentifier>]
39
- def get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
40
- item_ref = doc.at xpt
41
- return [] unless item_ref
42
-
43
- [RelatonBib::DocumentIdentifier.new(id: item_ref.text, type: "Chinese Standard")]
39
+ def get_docid(docref)
40
+ [RelatonBib::DocumentIdentifier.new(id: docref, type: "Chinese Standard")]
44
41
  end
45
42
 
46
- # @param doc [Nokogiri::HTML::Document]
47
- # @param xpt [String]
43
+ # @param docref [String]
48
44
  # @return [RelatonIsoBib::StructuredIdentifier]
49
- def fetch_structuredidentifier(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
50
- item_ref = doc.at xpt
51
- unless item_ref
52
- return RelatonIsoBib::StructuredIdentifier.new(
53
- project_number: "?", part_number: "?", prefix: nil, id: "?",
54
- type: "Chinese Standard"
55
- )
56
- end
57
-
58
- m = item_ref.text.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
59
- # prefix = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
45
+ def fetch_structuredidentifier(docref)
46
+ m = docref.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
60
47
  RelatonIsoBib::StructuredIdentifier.new(
61
48
  project_number: m[1], part_number: m[2], prefix: nil,
62
- id: item_ref.text, type: "Chinese Standard"
49
+ id: docref, type: "Chinese Standard"
63
50
  )
64
51
  end
65
52
 
66
- def get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
53
+ # @param doc [Nokogiri::HTML::Document]
54
+ # @param docref [String]
55
+ # @return [Array<Hash>]
56
+ def get_contributors(doc, docref)
67
57
  gb_en = GbAgencies::Agencies.new("en", {}, "")
68
58
  gb_zh = GbAgencies::Agencies.new("zh", {}, "")
69
- name = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
59
+ name = docref.match(/^[^\s]+/).to_s
70
60
  name.sub!(%r{/[TZ]$}, "") unless name =~ /^GB/
71
- gbtype = get_gbtype(doc)
61
+ gbtype = get_gbtype(doc, docref)
72
62
  entity = RelatonBib::Organization.new name: [
73
63
  { language: "en", content: gb_en.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
74
64
  { language: "zh", content: gb_zh.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
@@ -83,57 +73,56 @@ module RelatonGb
83
73
  # * :language [String]
84
74
  # * :script [String]
85
75
  def get_titles(doc)
86
- titles = [{ title_main: doc.css("div.page-header h4").text, title_intro: nil,
87
- language: "zh", script: "Hans" }]
88
- title_main = doc.css("div.page-header h5").text
76
+ titles = [{ title_main: doc.at("//td[contains(text(), '中文标准名称')]/b").text,
77
+ title_intro: nil, language: "zh", script: "Hans" }]
78
+ title_main = doc.at("//td[contains(text(), '英文标准名称')]").text.match(/[\w\s]+/).to_s
89
79
  unless title_main.empty?
90
80
  titles << { title_main: title_main, title_intro: nil, language: "en", script: "Latn" }
91
81
  end
92
82
  titles
93
83
  end
94
84
 
95
- def get_type(_doc)
96
- "international-standard"
85
+ def get_type
86
+ "standard"
97
87
  end
98
88
 
99
89
  # @param doc [Nokogiri::HTML::Document]
100
- # @param xpt [String]
90
+ # @param status [String, NilClass]
101
91
  # @return [RelatonBib::DocumentStatus]
102
- def get_status(doc, xpt = ".s-status.label:nth-child(3)")
103
- case doc.at(xpt).text.gsub(/\s/, "")
104
- when "即将实施"
105
- stage = "published"
106
- when "现行"
107
- stage = "activated"
108
- when "废止"
109
- stage = "obsoleted"
110
- end
92
+ def get_status(doc, status = nil)
93
+ stage = case status || doc.at("//td[contains(., '标准状态')]/span")&.text
94
+ when "即将实施" then "published"
95
+ when "现行" then "activated"
96
+ when "废止" then "obsoleted"
97
+ end
111
98
  RelatonBib::DocumentStatus.new stage: stage
112
99
  end
113
100
 
114
101
  private
115
102
 
116
103
  # @param doc [Nokogiri::HTML::Document]
104
+ # @param ref [String]
117
105
  # @return [Hash]
118
106
  # * :scope [String]
119
107
  # * :prefix [String]
120
108
  # * :mandate [String]
121
- def get_gbtype(doc)
122
- ref = get_ref(doc)
109
+ def get_gbtype(doc, ref)
110
+ # ref = get_ref(doc)
123
111
  { scope: get_scope(doc), prefix: get_prefix(ref)["prefix"],
124
112
  mandate: get_mandate(ref) }
125
113
  end
126
114
 
127
115
  # @param doc [Nokogiri::HTML::Document]
128
116
  # @return [String]
129
- def get_ref(doc)
130
- doc.xpath('//dt[text()="标准号"]/following-sibling::dd[1]').text
131
- end
117
+ # def get_ref(doc)
118
+ # doc.xpath('//dt[text()="标准号"]/following-sibling::dd[1]').text
119
+ # end
132
120
 
133
121
  # @param doc [Nokogiri::HTML::Document]
134
122
  # @return [Array<String>]
135
123
  def get_ccs(doc)
136
- [doc&.xpath('//dt[text()="中国标准分类号"]/following-sibling::dd[1]')&.text]
124
+ [doc.at("//div[contains(text(), '中国标准分类号')]/following-sibling::div").
125
+ text.delete("\r\n\t\t")]
137
126
  end
138
127
 
139
128
  # @param doc [Nokogiri::HTML::Document]
@@ -142,21 +131,21 @@ module RelatonGb
142
131
  # * :group [String]
143
132
  # * :subgroup [String]
144
133
  def get_ics(doc)
145
- ics = doc.xpath('//dt[(.="国际标准分类号")]/following-sibling::dd[1]/span')
146
- return [] if ics.empty?
134
+ ics = doc.at("//div[contains(text(), '国际标准分类号')]/following-sibling::div"\
135
+ " | //dt[contains(text(), '国际标准分类号')]/following-sibling::dd")
136
+ return [] unless ics
147
137
 
148
- field, group, subgroup = ics.text.split "."
138
+ field, group, subgroup = ics.text.delete("\r\n\t\t").split "."
149
139
  [{ field: field, group: group.ljust(3, "0"), subgroup: subgroup }]
150
140
  end
151
141
 
152
142
  # @param doc [Nokogiri::HTML::Document]
153
143
  # @return [String]
154
144
  def get_scope(doc)
155
- scope = doc.at(".s-status.label-info").text
156
- if scope == "国家标准"
157
- "national"
158
- elsif scope =~ /^行业标准/
159
- "sector"
145
+ issued = doc.at("//div[contains(., '发布单位')]/following-sibling::div")
146
+ case issued&.text
147
+ when /国家标准/ then "national"
148
+ when /^行业标准/ then "sector"
160
149
  end
161
150
  end
162
151
 
@@ -170,8 +159,7 @@ module RelatonGb
170
159
  # @param pref [String]
171
160
  # @return [Hash{String=>String}]
172
161
  def prefix(pref)
173
- file_path = File.join(__dir__, "yaml/prefixes.yaml")
174
- @prefixes ||= YAML.load_file(file_path)
162
+ @prefixes ||= YAML.load_file File.join(__dir__, "yaml/prefixes.yaml")
175
163
  @prefixes[pref]
176
164
  end
177
165
 
@@ -190,8 +178,9 @@ module RelatonGb
190
178
  # * :type [String] type of date
191
179
  # * :on [String] date
192
180
  def get_dates(doc)
193
- date = doc.xpath('//dt[.="发布日期"]/following-sibling::dd[1]').text
194
- [{ type: "published", on: date }]
181
+ date = doc.at("//div[contains(text(), '发布日期')]/following-sibling::div"\
182
+ " | //dt[contains(text(), '发布日期')]/following-sibling::dd")
183
+ [{ type: "published", on: date.text.delete("\r\n\t\t") }]
195
184
  end
196
185
  end
197
186
  end
@@ -18,42 +18,77 @@ module RelatonGb
18
18
  # @param text [String] code of standard for search
19
19
  # @return [RelatonGb::HitCollection]
20
20
  def scrape_page(text)
21
- uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{text}"
22
- res = JSON.parse Net::HTTP.get(uri)
23
- hits = res["rows"].map do |r|
24
- Hit.new pid: r["id"], title: r["STD_CODE"], scrapper: self
21
+ # uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{text}"
22
+ uri = URI "http://hbba.sacinfo.org.cn/stdQueryList"
23
+ resp = Net::HTTP.post uri, URI.encode_www_form({ key: text })
24
+ # res = JSON.parse Net::HTTP.get(uri)
25
+ json = JSON.parse resp.body
26
+ hits = json["records"].map do |h|
27
+ Hit.new pid: h["pk"], docref: h["code"], status: h["status"], scrapper: self
25
28
  end
29
+ # hits = res["rows"].map do |r|
30
+ # Hit.new pid: r["id"], title: r["STD_CODE"], scrapper: self
31
+ # end
26
32
  HitCollection.new hits
27
33
  rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
28
34
  Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
29
- OpenSSL::SSL::SSLError
35
+ OpenSSL::SSL::SSLError, Errno::ETIMEDOUT
30
36
  raise RelatonBib::RequestError, "Cannot access #{uri}"
31
37
  end
32
38
 
33
- # @param pid [String] standard's page id
39
+ # @param hit [RelatonGb::Hit]
34
40
  # @return [RelatonGb::GbBibliographicItem]
35
- def scrape_doc(pid)
36
- src = "http://www.std.gov.cn/hb/search/stdHBDetailed?id=#{pid}"
41
+ def scrape_doc(hit)
42
+ src = "http://hbba.sacinfo.org.cn/stdDetail/#{hit.pid}"
37
43
  page_uri = URI src
38
44
  doc = Nokogiri::HTML Net::HTTP.get(page_uri)
39
- GbBibliographicItem.new scrapped_data(doc, src: src)
45
+ GbBibliographicItem.new scrapped_data(doc, src, hit)
40
46
  rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
41
47
  Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
42
- OpenSSL::SSL::SSLError
48
+ OpenSSL::SSL::SSLError, Errno::ETIMEDOUT
43
49
  raise RelatonBib::RequestError, "Cannot access #{src}"
44
50
  end
45
51
 
46
52
  private
47
53
 
48
54
  # @param doc [Nokogiri::HTML::Document]
55
+ # @return [Array<Hash>]
56
+ # * :title_intro [String]
57
+ # * :title_main [String]
58
+ # * :language [String]
59
+ # * :script [String]
60
+ def get_titles(doc)
61
+ titles = [{ title_main: doc.at("//h4").text.delete("\r\n\t"),
62
+ title_intro: nil, language: "zh", script: "Hans" }]
63
+ # title_main = doc.at("//td[contains(text(), '英文标准名称')]").text.match(/[\w\s]+/).to_s
64
+ # unless title_main.empty?
65
+ # titles << { title_main: title_main, title_intro: nil, language: "en", script: "Latn" }
66
+ # end
67
+ titles
68
+ end
69
+
70
+ # @param _doc [Nokogiri::HTML::Document]
71
+ # @param ref [String]
49
72
  # @return [Hash]
50
73
  # * :type [String]
51
74
  # * :name [String]
52
- def get_committee(doc)
53
- ref = get_ref(doc)
75
+ def get_committee(_doc, ref)
76
+ # ref = get_ref(doc)
54
77
  name = get_prefix(ref)["administration"]
55
78
  { type: "technical", name: name }
56
79
  end
80
+
81
+ # @param _doc [Nokogiri::HTML::Document]
82
+ # @return [String]
83
+ def get_scope(_doc)
84
+ "sector"
85
+ end
86
+
87
+ # @param doc [Nokogiri::HTML::Document]
88
+ # @return [Array<String>]
89
+ def get_ccs(doc)
90
+ [doc.at("//dt[contains(text(), '中国标准分类号')]/following-sibling::dd").text]
91
+ end
57
92
  end
58
93
  end
59
94
  end
@@ -2,6 +2,7 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  require "open-uri"
5
+ require "net/http"
5
6
  require "nokogiri"
6
7
  require "relaton_gb/scrapper"
7
8
  require "relaton_gb/gb_bibliographic_item"
@@ -21,13 +22,15 @@ module RelatonGb
21
22
  search_html = OpenURI.open_uri(
22
23
  "http://www.ttbz.org.cn/Home/Standard?searchType=2&key=" +
23
24
  CGI.escape(text.tr("-", [8212].pack("U"))),
24
- )
25
+ ).read
25
26
  header = Nokogiri::HTML search_html
26
27
  xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
27
- t_xpath = "../preceding-sibling::td[3]"
28
+ t_xpath = "../preceding-sibling::td[4]"
28
29
  hits = header.xpath(xpath).map do |h|
29
- title = h.at(t_xpath).text.gsub(/â\u0080\u0094/, "-")
30
- Hit.new pid: h[:href].sub(%r{\/$}, ""), title: title, scrapper: self
30
+ docref = h.at(t_xpath).text.gsub(/â\u0080\u0094/, "-")
31
+ status = h.at("../preceding-sibling::td[1]").text.delete "\r\n"
32
+ pid = h[:href].sub(%r{\/$}, "")
33
+ Hit.new pid: pid, docref: docref, status: status, scrapper: self
31
34
  end
32
35
  HitCollection.new hits
33
36
  rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
@@ -35,12 +38,12 @@ module RelatonGb
35
38
  end
36
39
  # rubocop:enable Metrics/MethodLength, Metrics/AbcSize
37
40
 
38
- # @param pid [String] standard's page path
41
+ # @param hit [RelatonGb::Hit] search hit carrying the standard's page path
39
42
  # @return [RelatonGb::GbBibliographicItem]
40
- def scrape_doc(pid)
41
- src = "http://www.ttbz.org.cn#{pid}"
43
+ def scrape_doc(hit)
44
+ src = "http://www.ttbz.org.cn#{hit.pid}"
42
45
  doc = Nokogiri::HTML OpenURI.open_uri(src), nil, Encoding::UTF_8.to_s
43
- GbBibliographicItem.new scrapped_data(doc, src: src)
46
+ GbBibliographicItem.new scrapped_data(doc, src, hit)
44
47
  rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
45
48
  raise RelatonBib::RequestError, "Cannot access #{src}"
46
49
  end
@@ -49,16 +52,18 @@ module RelatonGb
49
52
 
50
53
  # rubocop:disable Metrics/MethodLength
51
54
  # @param doc [Nokogiri::HTML::Document]
55
+ # @param src [String]
56
+ # @param hit [RelatonGb::Hit]
52
57
  # @return [Hash]
53
- def scrapped_data(doc, src:)
54
- docid_xpt = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
55
- status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
58
+ def scrapped_data(doc, src, hit)
59
+ # docid_xpt = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
60
+ # status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
56
61
  {
57
- committee: get_committee(doc),
58
- docid: get_docid(doc, docid_xpt),
62
+ committee: get_committee(doc, hit.docref),
63
+ docid: get_docid(hit.docref),
59
64
  title: get_titles(doc),
60
- type: "international-standard",
61
- docstatus: get_status(doc, status_xpt),
65
+ type: get_type,
66
+ docstatus: get_status(doc, hit.status),
62
67
  gbtype: gbtype,
63
68
  ccs: get_ccs(doc),
64
69
  ics: get_ics(doc),
@@ -66,12 +71,12 @@ module RelatonGb
66
71
  date: get_dates(doc),
67
72
  language: ["zh"],
68
73
  script: ["Hans"],
69
- structuredidentifier: fetch_structuredidentifier(doc),
74
+ structuredidentifier: fetch_structuredidentifier(hit.docref),
70
75
  }
71
76
  end
72
77
  # rubocop:enable Metrics/MethodLength
73
78
 
74
- def get_committee(doc)
79
+ def get_committee(doc, _ref)
75
80
  {
76
81
  name: doc.xpath('//td[.="团体名称"]/following-sibling::td[1]').text,
77
82
  type: "technical",
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RelatonGb
4
- VERSION = "0.6.2"
4
+ VERSION = "0.6.3"
5
5
  end
data/lib/relaton_gb.rb CHANGED
@@ -1,7 +1,10 @@
1
1
  require "relaton_gb/version"
2
2
  require "relaton_gb/gb_bibliography"
3
3
 
4
- if defined? Relaton
5
- require_relative "relaton/processor"
6
- Relaton::Registry.instance.register Relaton::RelatonGb::Processor
7
- end
4
+ # if defined? Relaton
5
+ # require "relaton_gb/processor"
6
+ # # don't register the gem if it's required from relaton's registry
7
+ # return if caller.detect { |c| c.include? "register_gems" }
8
+
9
+ # Relaton::Registry.instance.register RelatonGb::Processor
10
+ # end
data/relaton_gb.gemspec CHANGED
@@ -32,6 +32,8 @@ Gem::Specification.new do |spec|
32
32
  spec.add_development_dependency "rspec", "~> 3.0"
33
33
  spec.add_development_dependency "ruby-debug-ide"
34
34
  spec.add_development_dependency "simplecov"
35
+ spec.add_development_dependency "vcr"
36
+ spec.add_development_dependency "webmock"
35
37
 
36
38
  spec.add_dependency "cnccs", "~> 0.1.1"
37
39
  spec.add_dependency "gb-agencies", "~> 0.0.1"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-gb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.6.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-08-20 00:00:00.000000000 Z
11
+ date: 2019-09-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -122,6 +122,34 @@ dependencies:
122
122
  - - ">="
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: vcr
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: webmock
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
125
153
  - !ruby/object:Gem::Dependency
126
154
  name: cnccs
127
155
  requirement: !ruby/object:Gem::Requirement
@@ -185,7 +213,6 @@ files:
185
213
  - appveyor.yml
186
214
  - bin/console
187
215
  - bin/setup
188
- - lib/relaton/processor.rb
189
216
  - lib/relaton_gb.rb
190
217
  - lib/relaton_gb/ccs.rb
191
218
  - lib/relaton_gb/gb_bibliographic_item.rb
@@ -196,6 +223,7 @@ files:
196
223
  - lib/relaton_gb/hash_converter.rb
197
224
  - lib/relaton_gb/hit.rb
198
225
  - lib/relaton_gb/hit_collection.rb
226
+ - lib/relaton_gb/processor.rb
199
227
  - lib/relaton_gb/scrapper.rb
200
228
  - lib/relaton_gb/sec_scrapper.rb
201
229
  - lib/relaton_gb/t_scrapper.rb
@@ -1,24 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "relaton/processor"
4
-
5
- module Relaton
6
- module RelatonGb
7
- class Processor < Relaton::Processor
8
- def initialize
9
- @short = :relaton_gb
10
- @prefix = "CN"
11
- @defaultprefix = %r{^GB }
12
- @idtype = "Chinese Standard"
13
- end
14
-
15
- def get(code, date, opts)
16
- ::RelatonGb::GbBibliography.get(code, date, opts)
17
- end
18
-
19
- def from_xml(xml)
20
- ::RelatonGb::XMLParser.from_xml xml
21
- end
22
- end
23
- end
24
- end