relaton-gb 0.6.2 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c0978ce217698027cca98e48fcad3fdf3060af8f
4
- data.tar.gz: 3f9e4857d33b1b3aee204746098c003816ded415
3
+ metadata.gz: 64d418169f8f5f6c500a43e913ea94d13d4dca7d
4
+ data.tar.gz: 2717d29fdc48172660535462cade3eb856dcf362
5
5
  SHA512:
6
- metadata.gz: 6a52a32b85031f0b31005a562fa3c95e630f9a33e353b7b0371eee23ebb8483430cf770bc16f900406357609ecbabae3dcb8dab1af6e2ac5a9151866bb03ceb0
7
- data.tar.gz: 9d3ab095b5681268b8113789b53c0b60140e4faf427918f29561460454fec99b9cff46bb9d39d5b6480df341f7d23396c17bcb8258fa95f413965a7f136f4e96
6
+ metadata.gz: e372dd88445fdb97b2bda4c41b755bb8b325ade7c271f0db0e567c697c7b4c80b6d1da24c7af1e8439fee87e9e9cf1b2ff2940e1dd86d5146c1efa73ec048e4b
7
+ data.tar.gz: ee2e5d7edc8f836505a024bff24e543fa70a05d9508885a3ead36b798a0750aa661a825f3f33250b00740a78b5edfb02d5610d616902aa7899d3ea75046e7594
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- relaton-gb (0.6.2)
4
+ relaton-gb (0.6.3)
5
5
  cnccs (~> 0.1.1)
6
6
  gb-agencies (~> 0.0.1)
7
7
  relaton-iso-bib (~> 0.3.0)
@@ -9,11 +9,13 @@ PATH
9
9
  GEM
10
10
  remote: https://rubygems.org/
11
11
  specs:
12
- addressable (2.6.0)
13
- public_suffix (>= 2.0.2, < 4.0)
12
+ addressable (2.7.0)
13
+ public_suffix (>= 2.0.2, < 5.0)
14
14
  byebug (11.0.1)
15
15
  cnccs (0.1.3)
16
16
  coderay (1.1.2)
17
+ crack (0.4.3)
18
+ safe_yaml (~> 1.0.0)
17
19
  debase (0.2.2)
18
20
  debase-ruby_core_source (>= 0.10.2)
19
21
  debase-ruby_core_source (0.10.5)
@@ -22,6 +24,7 @@ GEM
22
24
  equivalent-xml (0.6.0)
23
25
  nokogiri (>= 1.4.3)
24
26
  gb-agencies (0.0.5)
27
+ hashdiff (1.0.0)
25
28
  isoics (0.1.7)
26
29
  json (2.2.0)
27
30
  method_source (0.9.2)
@@ -34,12 +37,12 @@ GEM
34
37
  pry-byebug (3.7.0)
35
38
  byebug (~> 11.0)
36
39
  pry (~> 0.10)
37
- public_suffix (3.1.1)
40
+ public_suffix (4.0.1)
38
41
  rake (10.5.0)
39
- relaton-bib (0.3.5)
42
+ relaton-bib (0.3.6)
40
43
  addressable
41
- nokogiri (~> 1.10)
42
- relaton-iso-bib (0.3.4)
44
+ nokogiri
45
+ relaton-iso-bib (0.3.5)
43
46
  isoics (~> 0.1.6)
44
47
  relaton-bib (~> 0.3.0)
45
48
  ruby_deep_clone (~> 0.8.0)
@@ -59,11 +62,17 @@ GEM
59
62
  ruby-debug-ide (0.7.0)
60
63
  rake (>= 0.8.1)
61
64
  ruby_deep_clone (0.8.0)
65
+ safe_yaml (1.0.5)
62
66
  simplecov (0.16.1)
63
67
  docile (~> 1.1)
64
68
  json (>= 1.8, < 3)
65
69
  simplecov-html (~> 0.10.0)
66
70
  simplecov-html (0.10.2)
71
+ vcr (5.0.0)
72
+ webmock (3.7.0)
73
+ addressable (>= 2.3.6)
74
+ crack (>= 0.3.2)
75
+ hashdiff (>= 0.4.0, < 2.0.0)
67
76
 
68
77
  PLATFORMS
69
78
  ruby
@@ -78,6 +87,8 @@ DEPENDENCIES
78
87
  rspec (~> 3.0)
79
88
  ruby-debug-ide
80
89
  simplecov
90
+ vcr
91
+ webmock
81
92
 
82
93
  BUNDLED WITH
83
94
  2.0.1
@@ -91,16 +91,11 @@ module RelatonGb
91
91
  def search_filter(code)
92
92
  # search filter needs to incorporate year
93
93
  docidrx = %r{^[^\s]+\s[\d\.-]+}
94
- # corrigrx = %r{^[^\s]+\s[\d\.]+-[0-9]+/}
95
94
  warn "fetching #{code}..."
96
95
  result = search(code)
97
- ret = result.select do |hit|
98
- hit.title && hit.title.match(docidrx).to_s == code # &&
99
- # !corrigrx =~ hit.title
96
+ result.select do |hit|
97
+ hit.docref && hit.docref.match(docidrx).to_s.include?(code)
100
98
  end
101
- return ret unless ret.empty?
102
-
103
- []
104
99
  end
105
100
 
106
101
  # Sort through the results from Isobib, fetching them three at a time,
@@ -125,12 +120,15 @@ module RelatonGb
125
120
  { years: missed_years }
126
121
  end
127
122
 
128
- def fetch_pages(s, n)
129
- workers = RelatonBib::WorkersPool.new n
123
+ # @param hits [RelatonBib::HitCollection<RelatonBib::Hit>]
124
+ # @param threads [Integer]
125
+ # @return [Array<RelatonBib::GbBibliographicItem>]
126
+ def fetch_pages(hits, threads)
127
+ workers = RelatonBib::WorkersPool.new threads
130
128
  workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
131
- s.each_with_index { |hit, i| workers << { i: i, hit: hit } }
129
+ hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
132
130
  workers.end
133
- workers.result.sort { |x, y| x[:i] <=> y[:i] }.map { |x| x[:hit] }
131
+ workers.result.sort_by { |x| x[:i] }.map { |x| x[:hit] }
134
132
  end
135
133
  end
136
134
  end
@@ -16,35 +16,40 @@ module RelatonGb
16
16
  # @return [RelatonGb::HitCollection]
17
17
  def scrape_page(text)
18
18
  search_html = OpenURI.open_uri(
19
- "http://www.std.gov.cn/search/stdPage?q=" + text
19
+ "http://openstd.samr.gov.cn/bzgk/gb/std_list?p.p2=" + text
20
20
  )
21
21
  result = Nokogiri::HTML search_html
22
- hits = result.css(".s-title a").map do |h|
23
- Hit.new pid: h[:pid], title: h.text, scrapper: self
22
+ hits = result.xpath(
23
+ "//table[contains(@class, 'result_list')]/tbody[2]/tr",
24
+ ).map do |h|
25
+ ref = h.at "./td[2]/a"
26
+ pid = ref[:onclick].match(/[0-9A-F]+/).to_s
27
+ rdate = h.at("./td[7]").text
28
+ Hit.new pid: pid, docref: ref.text, scrapper: self, release_date: rdate
24
29
  end
25
- HitCollection.new hits
30
+ HitCollection.new hits.sort_by(&:release_date).reverse
26
31
  rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
27
- raise RelatonBib::RequestError, "Cannot access http://www.std.gov.cn/search/stdPage"
32
+ raise RelatonBib::RequestError, "Cannot access http://www.std.gov.cn/bzgk/gb/std_list"
28
33
  end
29
34
 
30
- # @param pid [Strin] standard's page id
35
+ # @param hit [RelatonGb::Hit] search hit whose pid is the standard's page id
31
36
  # @return [RelatonGb::GbBibliographicItem]
32
- def scrape_doc(pid)
33
- src = "http://www.std.gov.cn/gb/search/gbDetailed?id=" + pid
37
+ def scrape_doc(hit)
38
+ src = "http://openstd.samr.gov.cn/bzgk/gb/newGbInfo?hcno=" + hit.pid
34
39
  doc = Nokogiri::HTML OpenURI.open_uri(src)
35
- GbBibliographicItem.new scrapped_data(doc, src: src)
40
+ GbBibliographicItem.new scrapped_data(doc, src, hit)
36
41
  rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
37
42
  raise RelatonBib::RequestError, "Cannot access #{src}"
38
43
  end
39
44
 
40
45
  # @param doc [Nokogiri::HTML]
46
+ # @param _ref [String]
41
47
  # @return [Hash]
42
48
  # * :type [String]
43
49
  # * :name [String]
44
- def get_committee(doc)
45
- name = doc.xpath("//p/a[1]/following-sibling::text()").text.
46
- match(/(?<=()[^)]+/).to_s
47
- { type: "technical", name: name }
50
+ def get_committee(doc, _ref)
51
+ name = doc.at("//div[contains(text(), '归口单位')]/following-sibling::div")
52
+ { type: "technical", name: name.text.delete("\r\n\t\t") }
48
53
  end
49
54
  end
50
55
  end
@@ -7,28 +7,36 @@ module RelatonGb
7
7
  attr_reader :hit_collection
8
8
 
9
9
  # @return [String]
10
- attr_reader :pid
10
+ attr_reader :pid, :docref
11
11
 
12
- # @return [String]
13
- attr_reader :title
12
+ # @return [Date, NilClass]
13
+ attr_reader :release_date
14
+
15
+ # @return [String, NilClass]
16
+ attr_reader :status
14
17
 
15
18
  # @return [RelatonGb::GbScrapper, RelatonGb::SecScraper, RelatonGb::TScrapper]
16
19
  attr_reader :scrapper
17
20
 
18
- # @param hit [Hash]
19
- # @param hit_collection [Isobib:HitCollection]
20
- def initialize(pid:, title:, hit_collection: nil, scrapper:)
21
+ # @param pid [String]
22
+ # @param docref [String]
23
+ # @param scrapper [RelatonGb::GbScrapper, RelatonGb::SecScraper, RelatonGb::TScrapper]
24
+ # @param release_date [String]
25
+ # @param status [String, NilClass]
26
+ # @param hit_collection [RelatonGb::HitCollection, NilClass]
27
+ def initialize(pid:, docref:, scrapper:, **args)
21
28
  @pid = pid
22
- @title = title
23
- @hit_collection = hit_collection
29
+ @docref = docref
24
30
  @scrapper = scrapper
25
- self.hit_collection << self if hit_collection
31
+ @release_date = Date.parse args[:release_date] if args[:release_date]
32
+ @status = args[:status]
33
+ @hit_collection = args[:hit_collection]
26
34
  end
27
35
 
28
36
  # Parse page.
29
37
  # @return [Isobib::IsoBibliographicItem]
30
38
  def fetch
31
- @fetch ||= scrapper.scrape_doc pid
39
+ @fetch ||= scrapper.scrape_doc self
32
40
  end
33
41
 
34
42
  # @return [String]
@@ -40,21 +48,7 @@ module RelatonGb
40
48
  def inspect
41
49
  "<#{self.class}:#{format('%#.14x', object_id << 1)} "\
42
50
  "@fullIdentifier=\"#{@fetch&.shortref}\" "\
43
- "@title=\"#{title}\">"
51
+ "@docref=\"#{docref}\">"
44
52
  end
45
-
46
- # @param builder [Nokogiri::XML::Builder]
47
- # @param opts [Hash]
48
- # @return [String]
49
- # def to_xml(builder = nil, opts = {})
50
- # if builder
51
- # fetch.to_xml builder, opts
52
- # else
53
- # builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
54
- # fetch.to_xml xml, opts
55
- # end
56
- # builder.doc.root.to_xml
57
- # end
58
- # end
59
53
  end
60
54
  end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "relaton/processor"
4
+
5
+ module RelatonGb
6
+ class Processor < Relaton::Processor
7
+ def initialize
8
+ @short = :relaton_gb
9
+ @prefix = "CN"
10
+ @defaultprefix = %r{^GB }
11
+ @idtype = "Chinese Standard"
12
+ end
13
+
14
+ # @param code [String]
15
+ # @param date [String, NilClass] year
16
+ # @param opts [Hash]
17
+ # @return [RelatonGb::GbBibliographicItem]
18
+ def get(code, date, opts)
19
+ ::RelatonGb::GbBibliography.get(code, date, opts)
20
+ end
21
+
22
+ # @param xml [String]
23
+ # @return [RelatonGb::GbBibliographicItem]
24
+ def from_xml(xml)
25
+ ::RelatonGb::XMLParser.from_xml xml
26
+ end
27
+
28
+ # @param hash [Hash]
29
+ # @return [RelatonGb::GbBibliographicItem]
30
+ def hash_to_bib(hash)
31
+ item_hash = ::RelatonGb::HashConverter.hash_to_bib(hash)
32
+ ::RelatonGb::GbBibliographicItem.new item_hash
33
+ end
34
+ end
35
+ end
@@ -11,64 +11,54 @@ module RelatonGb
11
11
 
12
12
  # rubocop:disable Metrics/MethodLength
13
13
  # @param doc [Nokogiri::HTML::Document]
14
- # @param src [String] url of scrapped page
14
+ # @param src [String]
15
+ # @param hit [RelatonGb::Hit]
15
16
  # @return [Hash]
16
- def scrapped_data(doc, src:)
17
+ def scrapped_data(doc, src, hit)
17
18
  {
18
- committee: get_committee(doc),
19
- docid: get_docid(doc),
19
+ committee: get_committee(doc, hit.docref),
20
+ docid: get_docid(hit.docref),
20
21
  title: get_titles(doc),
21
- contributor: get_contributors(doc),
22
- type: get_type(doc),
23
- docstatus: get_status(doc),
24
- gbtype: get_gbtype(doc),
22
+ contributor: get_contributors(doc, hit.docref),
23
+ type: get_type,
24
+ docstatus: get_status(doc, hit.status),
25
+ gbtype: get_gbtype(doc, hit.docref),
25
26
  ccs: get_ccs(doc),
26
27
  ics: get_ics(doc),
27
28
  link: [{ type: "src", content: src }],
28
29
  date: get_dates(doc),
29
30
  language: ["zh"],
30
31
  script: ["Hans"],
31
- structuredidentifier: fetch_structuredidentifier(doc),
32
+ structuredidentifier: fetch_structuredidentifier(hit.docref),
32
33
  }
33
34
  end
34
35
  # rubocop:enable Metrics/MethodLength
35
36
 
36
- # @param doc [Nokogiri::HTML::Document]
37
- # @param xpt [String]
37
+ # @param docref [String]
38
38
  # @return [Array<RelatonBib::DocumentIdentifier>]
39
- def get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
40
- item_ref = doc.at xpt
41
- return [] unless item_ref
42
-
43
- [RelatonBib::DocumentIdentifier.new(id: item_ref.text, type: "Chinese Standard")]
39
+ def get_docid(docref)
40
+ [RelatonBib::DocumentIdentifier.new(id: docref, type: "Chinese Standard")]
44
41
  end
45
42
 
46
- # @param doc [Nokogiri::HTML::Document]
47
- # @param xpt [String]
43
+ # @param docref [String]
48
44
  # @return [RelatonIsoBib::StructuredIdentifier]
49
- def fetch_structuredidentifier(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
50
- item_ref = doc.at xpt
51
- unless item_ref
52
- return RelatonIsoBib::StructuredIdentifier.new(
53
- project_number: "?", part_number: "?", prefix: nil, id: "?",
54
- type: "Chinese Standard"
55
- )
56
- end
57
-
58
- m = item_ref.text.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
59
- # prefix = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
45
+ def fetch_structuredidentifier(docref)
46
+ m = docref.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
60
47
  RelatonIsoBib::StructuredIdentifier.new(
61
48
  project_number: m[1], part_number: m[2], prefix: nil,
62
- id: item_ref.text, type: "Chinese Standard"
49
+ id: docref, type: "Chinese Standard"
63
50
  )
64
51
  end
65
52
 
66
- def get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
53
+ # @param doc [Nokogiri::HTML::Document]
54
+ # @param docref [String]
55
+ # @return [Array<Hash>]
56
+ def get_contributors(doc, docref)
67
57
  gb_en = GbAgencies::Agencies.new("en", {}, "")
68
58
  gb_zh = GbAgencies::Agencies.new("zh", {}, "")
69
- name = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
59
+ name = docref.match(/^[^\s]+/).to_s
70
60
  name.sub!(%r{/[TZ]$}, "") unless name =~ /^GB/
71
- gbtype = get_gbtype(doc)
61
+ gbtype = get_gbtype(doc, docref)
72
62
  entity = RelatonBib::Organization.new name: [
73
63
  { language: "en", content: gb_en.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
74
64
  { language: "zh", content: gb_zh.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
@@ -83,57 +73,56 @@ module RelatonGb
83
73
  # * :language [String]
84
74
  # * :script [String]
85
75
  def get_titles(doc)
86
- titles = [{ title_main: doc.css("div.page-header h4").text, title_intro: nil,
87
- language: "zh", script: "Hans" }]
88
- title_main = doc.css("div.page-header h5").text
76
+ titles = [{ title_main: doc.at("//td[contains(text(), '中文标准名称')]/b").text,
77
+ title_intro: nil, language: "zh", script: "Hans" }]
78
+ title_main = doc.at("//td[contains(text(), '英文标准名称')]").text.match(/[\w\s]+/).to_s
89
79
  unless title_main.empty?
90
80
  titles << { title_main: title_main, title_intro: nil, language: "en", script: "Latn" }
91
81
  end
92
82
  titles
93
83
  end
94
84
 
95
- def get_type(_doc)
96
- "international-standard"
85
+ def get_type
86
+ "standard"
97
87
  end
98
88
 
99
89
  # @param doc [Nokogiri::HTML::Document]
100
- # @param xpt [String]
90
+ # @param status [String, NilClass]
101
91
  # @return [RelatonBib::DocumentStatus]
102
- def get_status(doc, xpt = ".s-status.label:nth-child(3)")
103
- case doc.at(xpt).text.gsub(/\s/, "")
104
- when "即将实施"
105
- stage = "published"
106
- when "现行"
107
- stage = "activated"
108
- when "废止"
109
- stage = "obsoleted"
110
- end
92
+ def get_status(doc, status = nil)
93
+ stage = case status || doc.at("//td[contains(., '标准状态')]/span")&.text
94
+ when "即将实施" then "published"
95
+ when "现行" then "activated"
96
+ when "废止" then "obsoleted"
97
+ end
111
98
  RelatonBib::DocumentStatus.new stage: stage
112
99
  end
113
100
 
114
101
  private
115
102
 
116
103
  # @param doc [Nokogiri::HTML::Document]
104
+ # @param ref [String]
117
105
  # @return [Hash]
118
106
  # * :scope [String]
119
107
  # * :prefix [String]
120
108
  # * :mandate [String]
121
- def get_gbtype(doc)
122
- ref = get_ref(doc)
109
+ def get_gbtype(doc, ref)
110
+ # ref = get_ref(doc)
123
111
  { scope: get_scope(doc), prefix: get_prefix(ref)["prefix"],
124
112
  mandate: get_mandate(ref) }
125
113
  end
126
114
 
127
115
  # @param doc [Nokogiri::HTML::Document]
128
116
  # @return [String]
129
- def get_ref(doc)
130
- doc.xpath('//dt[text()="标准号"]/following-sibling::dd[1]').text
131
- end
117
+ # def get_ref(doc)
118
+ # doc.xpath('//dt[text()="标准号"]/following-sibling::dd[1]').text
119
+ # end
132
120
 
133
121
  # @param doc [Nokogiri::HTML::Document]
134
122
  # @return [Array<String>]
135
123
  def get_ccs(doc)
136
- [doc&.xpath('//dt[text()="中国标准分类号"]/following-sibling::dd[1]')&.text]
124
+ [doc.at("//div[contains(text(), '中国标准分类号')]/following-sibling::div").
125
+ text.delete("\r\n\t\t")]
137
126
  end
138
127
 
139
128
  # @param doc [Nokogiri::HTML::Document]
@@ -142,21 +131,21 @@ module RelatonGb
142
131
  # * :group [String]
143
132
  # * :subgroup [String]
144
133
  def get_ics(doc)
145
- ics = doc.xpath('//dt[(.="国际标准分类号")]/following-sibling::dd[1]/span')
146
- return [] if ics.empty?
134
+ ics = doc.at("//div[contains(text(), '国际标准分类号')]/following-sibling::div"\
135
+ " | //dt[contains(text(), '国际标准分类号')]/following-sibling::dd")
136
+ return [] unless ics
147
137
 
148
- field, group, subgroup = ics.text.split "."
138
+ field, group, subgroup = ics.text.delete("\r\n\t\t").split "."
149
139
  [{ field: field, group: group.ljust(3, "0"), subgroup: subgroup }]
150
140
  end
151
141
 
152
142
  # @param doc [Nokogiri::HTML::Document]
153
143
  # @return [String]
154
144
  def get_scope(doc)
155
- scope = doc.at(".s-status.label-info").text
156
- if scope == "国家标准"
157
- "national"
158
- elsif scope =~ /^行业标准/
159
- "sector"
145
+ issued = doc.at("//div[contains(., '发布单位')]/following-sibling::div")
146
+ case issued&.text
147
+ when /国家标准/ then "national"
148
+ when /^行业标准/ then "sector"
160
149
  end
161
150
  end
162
151
 
@@ -170,8 +159,7 @@ module RelatonGb
170
159
  # @param pref [String]
171
160
  # @return [Hash{String=>String}]
172
161
  def prefix(pref)
173
- file_path = File.join(__dir__, "yaml/prefixes.yaml")
174
- @prefixes ||= YAML.load_file(file_path)
162
+ @prefixes ||= YAML.load_file File.join(__dir__, "yaml/prefixes.yaml")
175
163
  @prefixes[pref]
176
164
  end
177
165
 
@@ -190,8 +178,9 @@ module RelatonGb
190
178
  # * :type [String] type of date
191
179
  # * :on [String] date
192
180
  def get_dates(doc)
193
- date = doc.xpath('//dt[.="发布日期"]/following-sibling::dd[1]').text
194
- [{ type: "published", on: date }]
181
+ date = doc.at("//div[contains(text(), '发布日期')]/following-sibling::div"\
182
+ " | //dt[contains(text(), '发布日期')]/following-sibling::dd")
183
+ [{ type: "published", on: date.text.delete("\r\n\t\t") }]
195
184
  end
196
185
  end
197
186
  end
@@ -18,42 +18,77 @@ module RelatonGb
18
18
  # @param text [String] code of standard for search
19
19
  # @return [RelatonGb::HitCollection]
20
20
  def scrape_page(text)
21
- uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{text}"
22
- res = JSON.parse Net::HTTP.get(uri)
23
- hits = res["rows"].map do |r|
24
- Hit.new pid: r["id"], title: r["STD_CODE"], scrapper: self
21
+ # uri = URI "http://www.std.gov.cn/hb/search/hbPage?searchText=#{text}"
22
+ uri = URI "http://hbba.sacinfo.org.cn/stdQueryList"
23
+ resp = Net::HTTP.post uri, URI.encode_www_form({ key: text })
24
+ # res = JSON.parse Net::HTTP.get(uri)
25
+ json = JSON.parse resp.body
26
+ hits = json["records"].map do |h|
27
+ Hit.new pid: h["pk"], docref: h["code"], status: h["status"], scrapper: self
25
28
  end
29
+ # hits = res["rows"].map do |r|
30
+ # Hit.new pid: r["id"], title: r["STD_CODE"], scrapper: self
31
+ # end
26
32
  HitCollection.new hits
27
33
  rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
28
34
  Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
29
- OpenSSL::SSL::SSLError
35
+ OpenSSL::SSL::SSLError, Errno::ETIMEDOUT
30
36
  raise RelatonBib::RequestError, "Cannot access #{uri}"
31
37
  end
32
38
 
33
- # @param pid [String] standard's page id
39
+ # @param hit [RelatonGb::Hit]
34
40
  # @return [RelatonGb::GbBibliographicItem]
35
- def scrape_doc(pid)
36
- src = "http://www.std.gov.cn/hb/search/stdHBDetailed?id=#{pid}"
41
+ def scrape_doc(hit)
42
+ src = "http://hbba.sacinfo.org.cn/stdDetail/#{hit.pid}"
37
43
  page_uri = URI src
38
44
  doc = Nokogiri::HTML Net::HTTP.get(page_uri)
39
- GbBibliographicItem.new scrapped_data(doc, src: src)
45
+ GbBibliographicItem.new scrapped_data(doc, src, hit)
40
46
  rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
41
47
  Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
42
- OpenSSL::SSL::SSLError
48
+ OpenSSL::SSL::SSLError, Errno::ETIMEDOUT
43
49
  raise RelatonBib::RequestError, "Cannot access #{src}"
44
50
  end
45
51
 
46
52
  private
47
53
 
48
54
  # @param doc [Nokogiri::HTML::Document]
55
+ # @return [Array<Hash>]
56
+ # * :title_intro [String]
57
+ # * :title_main [String]
58
+ # * :language [String]
59
+ # * :script [String]
60
+ def get_titles(doc)
61
+ titles = [{ title_main: doc.at("//h4").text.delete("\r\n\t"),
62
+ title_intro: nil, language: "zh", script: "Hans" }]
63
+ # title_main = doc.at("//td[contains(text(), '英文标准名称')]").text.match(/[\w\s]+/).to_s
64
+ # unless title_main.empty?
65
+ # titles << { title_main: title_main, title_intro: nil, language: "en", script: "Latn" }
66
+ # end
67
+ titles
68
+ end
69
+
70
+ # @param _doc [Nokogiri::HTML::Document]
71
+ # @param ref [String]
49
72
  # @return [Hash]
50
73
  # * :type [String]
51
74
  # * :name [String]
52
- def get_committee(doc)
53
- ref = get_ref(doc)
75
+ def get_committee(_doc, ref)
76
+ # ref = get_ref(doc)
54
77
  name = get_prefix(ref)["administration"]
55
78
  { type: "technical", name: name }
56
79
  end
80
+
81
+ # @param _doc [Nokogiri::HTML::Document]
82
+ # @return [String]
83
+ def get_scope(_doc)
84
+ "sector"
85
+ end
86
+
87
+ # @param doc [Nokogiri::HTML::Document]
88
+ # @return [Array<String>]
89
+ def get_ccs(doc)
90
+ [doc.at("//dt[contains(text(), '中国标准分类号')]/following-sibling::dd").text]
91
+ end
57
92
  end
58
93
  end
59
94
  end
@@ -2,6 +2,7 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  require "open-uri"
5
+ require "net/http"
5
6
  require "nokogiri"
6
7
  require "relaton_gb/scrapper"
7
8
  require "relaton_gb/gb_bibliographic_item"
@@ -21,13 +22,15 @@ module RelatonGb
21
22
  search_html = OpenURI.open_uri(
22
23
  "http://www.ttbz.org.cn/Home/Standard?searchType=2&key=" +
23
24
  CGI.escape(text.tr("-", [8212].pack("U"))),
24
- )
25
+ ).read
25
26
  header = Nokogiri::HTML search_html
26
27
  xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
27
- t_xpath = "../preceding-sibling::td[3]"
28
+ t_xpath = "../preceding-sibling::td[4]"
28
29
  hits = header.xpath(xpath).map do |h|
29
- title = h.at(t_xpath).text.gsub(/â\u0080\u0094/, "-")
30
- Hit.new pid: h[:href].sub(%r{\/$}, ""), title: title, scrapper: self
30
+ docref = h.at(t_xpath).text.gsub(/â\u0080\u0094/, "-")
31
+ status = h.at("../preceding-sibling::td[1]").text.delete "\r\n"
32
+ pid = h[:href].sub(%r{\/$}, "")
33
+ Hit.new pid: pid, docref: docref, status: status, scrapper: self
31
34
  end
32
35
  HitCollection.new hits
33
36
  rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
@@ -35,12 +38,12 @@ module RelatonGb
35
38
  end
36
39
  # rubocop:enable Metrics/MethodLength, Metrics/AbcSize
37
40
 
38
- # @param pid [String] standard's page path
41
+ # @param hit [RelatonGb::Hit] search hit whose pid is the standard's page path
39
42
  # @return [RelatonGb::GbBibliographicItem]
40
- def scrape_doc(pid)
41
- src = "http://www.ttbz.org.cn#{pid}"
43
+ def scrape_doc(hit)
44
+ src = "http://www.ttbz.org.cn#{hit.pid}"
42
45
  doc = Nokogiri::HTML OpenURI.open_uri(src), nil, Encoding::UTF_8.to_s
43
- GbBibliographicItem.new scrapped_data(doc, src: src)
46
+ GbBibliographicItem.new scrapped_data(doc, src, hit)
44
47
  rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError
45
48
  raise RelatonBib::RequestError, "Cannot access #{src}"
46
49
  end
@@ -49,16 +52,18 @@ module RelatonGb
49
52
 
50
53
  # rubocop:disable Metrics/MethodLength
51
54
  # @param doc [Nokogiri::HTML::Document]
55
+ # @param src [String]
56
+ # @param hit [RelatonGb::Hit]
52
57
  # @return [Hash]
53
- def scrapped_data(doc, src:)
54
- docid_xpt = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
55
- status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
58
+ def scrapped_data(doc, src, hit)
59
+ # docid_xpt = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
60
+ # status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
56
61
  {
57
- committee: get_committee(doc),
58
- docid: get_docid(doc, docid_xpt),
62
+ committee: get_committee(doc, hit.docref),
63
+ docid: get_docid(hit.docref),
59
64
  title: get_titles(doc),
60
- type: "international-standard",
61
- docstatus: get_status(doc, status_xpt),
65
+ type: get_type,
66
+ docstatus: get_status(doc, hit.status),
62
67
  gbtype: gbtype,
63
68
  ccs: get_ccs(doc),
64
69
  ics: get_ics(doc),
@@ -66,12 +71,12 @@ module RelatonGb
66
71
  date: get_dates(doc),
67
72
  language: ["zh"],
68
73
  script: ["Hans"],
69
- structuredidentifier: fetch_structuredidentifier(doc),
74
+ structuredidentifier: fetch_structuredidentifier(hit.docref),
70
75
  }
71
76
  end
72
77
  # rubocop:enable Metrics/MethodLength
73
78
 
74
- def get_committee(doc)
79
+ def get_committee(doc, _ref)
75
80
  {
76
81
  name: doc.xpath('//td[.="团体名称"]/following-sibling::td[1]').text,
77
82
  type: "technical",
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RelatonGb
4
- VERSION = "0.6.2"
4
+ VERSION = "0.6.3"
5
5
  end
data/lib/relaton_gb.rb CHANGED
@@ -1,7 +1,10 @@
1
1
  require "relaton_gb/version"
2
2
  require "relaton_gb/gb_bibliography"
3
3
 
4
- if defined? Relaton
5
- require_relative "relaton/processor"
6
- Relaton::Registry.instance.register Relaton::RelatonGb::Processor
7
- end
4
+ # if defined? Relaton
5
+ # require "relaton_gb/processor"
6
+ # # don't register the gem if it's required from relaton's registry
7
+ # return if caller.detect { |c| c.include? "register_gems" }
8
+
9
+ # Relaton::Registry.instance.register RelatonGb::Processor
10
+ # end
data/relaton_gb.gemspec CHANGED
@@ -32,6 +32,8 @@ Gem::Specification.new do |spec|
32
32
  spec.add_development_dependency "rspec", "~> 3.0"
33
33
  spec.add_development_dependency "ruby-debug-ide"
34
34
  spec.add_development_dependency "simplecov"
35
+ spec.add_development_dependency "vcr"
36
+ spec.add_development_dependency "webmock"
35
37
 
36
38
  spec.add_dependency "cnccs", "~> 0.1.1"
37
39
  spec.add_dependency "gb-agencies", "~> 0.0.1"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-gb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.6.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-08-20 00:00:00.000000000 Z
11
+ date: 2019-09-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -122,6 +122,34 @@ dependencies:
122
122
  - - ">="
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: vcr
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: webmock
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
125
153
  - !ruby/object:Gem::Dependency
126
154
  name: cnccs
127
155
  requirement: !ruby/object:Gem::Requirement
@@ -185,7 +213,6 @@ files:
185
213
  - appveyor.yml
186
214
  - bin/console
187
215
  - bin/setup
188
- - lib/relaton/processor.rb
189
216
  - lib/relaton_gb.rb
190
217
  - lib/relaton_gb/ccs.rb
191
218
  - lib/relaton_gb/gb_bibliographic_item.rb
@@ -196,6 +223,7 @@ files:
196
223
  - lib/relaton_gb/hash_converter.rb
197
224
  - lib/relaton_gb/hit.rb
198
225
  - lib/relaton_gb/hit_collection.rb
226
+ - lib/relaton_gb/processor.rb
199
227
  - lib/relaton_gb/scrapper.rb
200
228
  - lib/relaton_gb/sec_scrapper.rb
201
229
  - lib/relaton_gb/t_scrapper.rb
@@ -1,24 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "relaton/processor"
4
-
5
- module Relaton
6
- module RelatonGb
7
- class Processor < Relaton::Processor
8
- def initialize
9
- @short = :relaton_gb
10
- @prefix = "CN"
11
- @defaultprefix = %r{^GB }
12
- @idtype = "Chinese Standard"
13
- end
14
-
15
- def get(code, date, opts)
16
- ::RelatonGb::GbBibliography.get(code, date, opts)
17
- end
18
-
19
- def from_xml(xml)
20
- ::RelatonGb::XMLParser.from_xml xml
21
- end
22
- end
23
- end
24
- end