relaton-cie 1.14.0 → 1.14.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0de5250a785ffc6f5b0c4e93a1011007bc9ddf72339746c94570ebf7b6a25a78
4
- data.tar.gz: 4173a47ac59c47492141c40731fcb498451675ef97430a563686cadcb1388f2c
3
+ metadata.gz: a55c7a70b3d2fe2a2300cf69bf6d1e000d684d9897106c1510c690fa598e994c
4
+ data.tar.gz: 4d57b3900f4092e4a94553c6c285eaf606bdfc3366571e0605d90f7445d799b9
5
5
  SHA512:
6
- metadata.gz: f7f5f59752a190ca8769df82b29de6051e6780b59eb81452cc3d2a441f66d6174906233a35085b5845f3eddeb448b876f5057dfe337e5e34a03922bbfbe2c35a
7
- data.tar.gz: f7d83a7aefcdbc7bae57b4d31fccb89f82a79285ab746494a5633241263d428ff1abe9be75042361e4ff58f3359578004d1da8c1f4a919b45de9d983b266f2f1
6
+ metadata.gz: 59626d155d94bc7f8d128071b70c1eefa6efff43f580e080e5226645c9ca669352e6b2d5126604c3508f435068638306184aa6a4187a8b336fa07e6633d31eef
7
+ data.tar.gz: 5fed598422fc98c9c458d6d46b0a987ba0cb40ea17f1681f527248f54c74ab955bd40895f80cf4eb036d587ab7c968d8f34197d29e15edca65465cb667978224
@@ -5,6 +5,7 @@ name: rake
5
5
  on:
6
6
  push:
7
7
  branches: [ master, main ]
8
+ tags: [ v* ]
8
9
  pull_request:
9
10
 
10
11
  jobs:
@@ -10,8 +10,9 @@ on:
10
10
  Next release version. Possible values: x.y.z, major, minor, patch or pre|rc|etc
11
11
  required: true
12
12
  default: 'skip'
13
- push:
14
- tags: [ v* ]
13
+ repository_dispatch:
14
+ types: [ do-release ]
15
+
15
16
 
16
17
  jobs:
17
18
  release:
data/.gitignore CHANGED
@@ -6,6 +6,7 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+ /vendor/
9
10
 
10
11
  # rspec failure tracking
11
12
  .rspec_status
data/.rubocop.yml CHANGED
@@ -7,6 +7,6 @@ require: rubocop-rails
7
7
  inherit_from:
8
8
  - https://raw.githubusercontent.com/riboseinc/oss-guides/master/ci/rubocop.yml
9
9
  AllCops:
10
- TargetRubyVersion: 2.5
10
+ TargetRubyVersion: 2.7
11
11
  Rails:
12
12
  Enabled: false
data/Gemfile CHANGED
@@ -3,5 +3,11 @@ source "https://rubygems.org"
3
3
  # Specify your gem's dependencies in relaton_cie.gemspec
4
4
  gemspec
5
5
 
6
+ gem "equivalent-xml", "~> 0.6"
7
+ gem "pry-byebug"
6
8
  gem "rake", "~> 13.0"
7
9
  gem "rspec", "~> 3.0"
10
+ gem "ruby-jing"
11
+ gem "simplecov"
12
+ gem "vcr"
13
+ gem "webmock"
data/README.adoc CHANGED
@@ -99,13 +99,13 @@ item = RelatonCie::XMLParser.from_xml File.read("spec/fixtures/bibdata.xml")
99
99
 
100
100
  === Fetch data
101
101
 
102
- This gem uses the https://www.techstreet.com/cie/searches/31156444 dataset as one of data sources.
102
+ This gem uses the https://www.techstreet.com/cie/searches/31156444 dataset as one of the data sources.
103
103
 
104
104
  The method `RelatonCie::DataFetcher.fetch(output: "data", format: "yaml")` fetches all the documents from the dataset and saves them to the `./data` folder in YAML format.
105
105
  Arguments:
106
106
 
107
107
  - `output` - folder to save documents (default './data').
108
- - `format` - the format in which the documents are saved. Possible formats are: `yaml`, `xml` (default `yaml`).
108
+ - `format` - the format in which the documents are saved. Possible formats are: `yaml`, `xml`, `bibxml` (default `yaml`).
109
109
 
110
110
  [source,ruby]
111
111
  ----
@@ -522,7 +522,6 @@
522
522
  <value>tip</value>
523
523
  <value>important</value>
524
524
  <value>caution</value>
525
- <value>statement</value>
526
525
  </choice>
527
526
  </define>
528
527
  <define name="figure">
data/grammars/biblio.rng CHANGED
@@ -216,6 +216,9 @@
216
216
  <optional>
217
217
  <ref name="fullname"/>
218
218
  </optional>
219
+ <zeroOrMore>
220
+ <ref name="credential"/>
221
+ </zeroOrMore>
219
222
  <zeroOrMore>
220
223
  <ref name="affiliation"/>
221
224
  </zeroOrMore>
@@ -232,6 +235,11 @@
232
235
  <ref name="FullNameType"/>
233
236
  </element>
234
237
  </define>
238
+ <define name="credential">
239
+ <element name="credential">
240
+ <text/>
241
+ </element>
242
+ </define>
235
243
  <define name="FullNameType">
236
244
  <choice>
237
245
  <group>
@@ -305,7 +313,9 @@
305
313
  <zeroOrMore>
306
314
  <ref name="affiliationdescription"/>
307
315
  </zeroOrMore>
308
- <ref name="organization"/>
316
+ <optional>
317
+ <ref name="organization"/>
318
+ </optional>
309
319
  </element>
310
320
  </define>
311
321
  <define name="affiliationname">
@@ -1316,7 +1326,7 @@
1316
1326
  <value>commentaryOf</value>
1317
1327
  <value>hasCommentary</value>
1318
1328
  <value>related</value>
1319
- <value>complements</value>
1329
+ <value>hasComplement</value>
1320
1330
  <value>complementOf</value>
1321
1331
  <value>obsoletes</value>
1322
1332
  <value>obsoletedBy</value>
@@ -10,35 +10,64 @@ module RelatonCie
10
10
  URL = "https://www.techstreet.com/cie/searches/31156444?page=1&per_page=100"
11
11
 
12
12
  def initialize(output, format)
13
- @agent = Mechanize.new
14
13
  @output = output
15
14
  @format = format
15
+ @files = []
16
+ @ext = format == "bibxml" ? "xml" : format
17
+ end
18
+
19
+ def agent
20
+ @agent ||= Mechanize.new
21
+ end
22
+
23
+ def index
24
+ @index ||= Relaton::Index.find_or_create :cie, file: "index-v1.yaml"
16
25
  end
17
26
 
18
27
  # @param hit [Nokogiri::HTML::Document]
19
28
  # @param doc [Mechanize::Page]
20
29
  # @return [Array<RelatonBib::DocumentIdentifier>]
21
- def fetch_docid(hit, doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
22
- code = hit.at("h3/a").text.strip.sub(/\u25b9/, "").gsub(" / ", "/")
30
+ def fetch_docid(hit, doc)
31
+ code, code2 = parse_code hit, doc
32
+ docid = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: code, primary: true)]
33
+ if code2
34
+ type2 = code2.match(/\w+/).to_s
35
+ docid << RelatonBib::DocumentIdentifier.new(type: type2, id: code2.strip)
36
+ end
37
+ isbn = doc.at('//dt[contains(.,"ISBN")]/following-sibling::dd')
38
+ docid << RelatonBib::DocumentIdentifier.new(type: "ISBN", id: isbn.text.strip) if isbn
39
+ docid
40
+ end
41
+
42
+ def parse_code(hit, doc = nil)
43
+ code = hit.at("h3/a").text.strip.squeeze(" ").sub(/\u25b9/, "").gsub(" / ", "/")
23
44
  c2idx = %r{(?:\(|/)(?<c2>(?:ISO|IEC)\s[^()]+)} =~ code
24
45
  code = code[0...c2idx].strip if c2idx
46
+ [primary_code(code, doc), c2]
47
+ end
48
+
49
+ def primary_code(code, doc = nil)
25
50
  /^(?<code1>[^(]+)(?:\((?<code2>\w+\d+,(?:\sPages)?[^)]+))?/ =~ code
26
- if code1.match?(/^CIE/)
27
- c = code1.size > 25 && code2 ? "CIE #{code2.sub(/,(\sPages)?/, '')}" : code1
28
- add = doc.at("//hgroup/h2")&.text&.match(/(Add)endum\s(\d+)$/)
29
- c += " #{add[1]} #{add[2]}" if add
30
- elsif (pcode = doc.at('//dt[.="Product Code(s):"]/following-sibling::dd'))
31
- c = "CIE #{pcode.text.strip.match(/[^,]+/)}"
51
+ if code1&.match?(/^CIE/)
52
+ parse_cie_code code1, code2, doc
53
+ elsif (pcode = doc&.at('//dt[.="Product Code(s):"]/following-sibling::dd'))
54
+ "CIE #{pcode.text.strip.match(/[^,]+/)}"
32
55
  else
33
- num = code.match(/(?<=\()\w{2}\d+,.+(?=\))/).to_s.gsub(/,(?=\s)/, "")
34
- .gsub(/,(?=\S)/, " ")
35
- c = "CIE #{num}"
56
+ num = code.match(/(?<=\()\w{2}\d+,.+(?=\))/).to_s.gsub(/,(?=\s)/, "").gsub(/,(?=\S)/, " ")
57
+ "CIE #{num}"
36
58
  end
37
- docid = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: c, primary: true)]
38
- isbn = doc.at('//dt[contains(.,"ISBN")]/following-sibling::dd')
39
- docid << RelatonBib::DocumentIdentifier.new(type: c2.match(/\w+/).to_s, id: c2.strip) if c2
40
- docid << RelatonBib::DocumentIdentifier.new(type: "ISBN", id: isbn.text.strip) if isbn
41
- docid
59
+ end
60
+
61
+ def parse_cie_code(code1, code2, doc = nil) # rubocop:disable Metrics/CyclomaticComplexity
62
+ code = code1.size > 25 && code2 ? "CIE #{code2.sub(/,(\sPages)?/, '')}" : code1
63
+ add = doc&.at("//hgroup/h2")&.text&.match(/(Add)endum\s(\d+)$/)
64
+ return code unless add
65
+
66
+ "#{code} #{add[1]} #{add[2]}"
67
+ end
68
+
69
+ def fetch_docnumber(hit)
70
+ parse_code(hit).first.sub(/\w+\s/, "")
42
71
  end
43
72
 
44
73
  # @param doc [Mechanize::Page]
@@ -63,7 +92,7 @@ module RelatonCie
63
92
  # @param doc [Mechanize::Page]
64
93
  # @return [String]
65
94
  def fetch_edition(doc)
66
- doc.at('//dt[.="Edition:"]/following-sibling::dd')&.text&.match(/^\d+(?=th)/)&.to_s
95
+ doc.at('//dt[.="Edition:"]/following-sibling::dd')&.text&.match(/^\d+(?=(st|nd|rd|th))/)&.to_s
67
96
  end
68
97
 
69
98
  # @param doc [Mechanize::Page]
@@ -132,24 +161,32 @@ module RelatonCie
132
161
  end
133
162
 
134
163
  # @param bib [RelatonCie::BibliographicItem]
135
- def write_file(bib)
164
+ def write_file(bib) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
136
165
  id = bib.docidentifier[0].id.gsub(%r{[/\s\-:.]}, "_")
137
166
  file = "#{@output}/#{id.upcase}.#{@format}"
138
- # if File.exist? file
139
- # warn "File #{file} exists. Docid: #{bib.docidentifier[0].id}"
140
- # warn "Link: #{bib.link.detect { |l| l.type == 'src' }.content}"
141
- # else
142
- out = @format == "xml" ? bib.to_xml(bibdata: true) : bib.to_hash.to_yaml
143
- File.write file, out, encoding: "UTF-8"
144
- # end
167
+ if @files.include? file
168
+ warn "File #{file} exists. Docid: #{bib.docidentifier[0].id}"
169
+ warn "Link: #{bib.link.detect { |l| l.type == 'src' }.content}"
170
+ else @files << file
171
+ end
172
+ index.add_or_update bib.docidentifier[0].id, file
173
+ File.write file, content(bib), encoding: "UTF-8"
174
+ end
175
+
176
+ def content(bib)
177
+ case @format
178
+ when "xml" then bib.to_xml(bibdata: true)
179
+ when "yaml" then bib.to_hash.to_yaml
180
+ when "bibxml" then bib.to_bibxml
181
+ end
145
182
  end
146
183
 
147
184
  # @param hit [Nokogiri::HTML::Element]
148
185
  def parse_page(hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
149
186
  url = "https://www.techstreet.com#{hit.at('h3/a')[:href]}"
150
- doc = time_req { @agent.get url }
187
+ doc = time_req { agent.get url }
151
188
  item = BibliographicItem.new(
152
- type: "standard", link: fetch_link(url),
189
+ type: "standard", link: fetch_link(url), docnumber: fetch_docnumber(hit),
153
190
  docid: fetch_docid(hit, doc), title: fetch_title(doc),
154
191
  abstract: fetch_abstract(doc), date: fetch_date(doc),
155
192
  edition: fetch_edition(doc), contributor: fetch_contributor(doc),
@@ -164,10 +201,14 @@ module RelatonCie
164
201
  end
165
202
 
166
203
  def fetch(url)
167
- result = time_req { @agent.get url }
204
+ result = time_req { agent.get url }
168
205
  result.xpath("//li[@data-product]").each { |hit| parse_page hit }
169
206
  np = result.at '//a[@class="next_page"]'
170
- fetch "https://www.techstreet.com#{np[:href]}" if np
207
+ if np
208
+ fetch "https://www.techstreet.com#{np[:href]}"
209
+ else
210
+ index.save
211
+ end
171
212
  end
172
213
 
173
214
  def time_req
@@ -182,7 +223,7 @@ module RelatonCie
182
223
  t1 = Time.now
183
224
  puts "Started at: #{t1}"
184
225
 
185
- FileUtils.mkdir output
226
+ FileUtils.mkdir_p output
186
227
  new(output, format).fetch URL
187
228
 
188
229
  t2 = Time.now
@@ -47,5 +47,12 @@ module RelatonCie
47
47
  def grammar_hash
48
48
  @grammar_hash ||= ::RelatonCie.grammar_hash
49
49
  end
50
+
51
+ #
52
+ # Remove index file
53
+ #
54
+ def remove_index_file
55
+ Relaton::Index.find_or_create(:cie, url: true, file: Scrapper::INDEX_FILE).remove_file
56
+ end
50
57
  end
51
58
  end
@@ -1,13 +1,17 @@
1
1
  module RelatonCie
2
2
  module Scrapper
3
- ENDPOINT = "https://raw.githubusercontent.com/relaton/relaton-data-cie/master/data/".freeze
3
+ ENDPOINT = "https://raw.githubusercontent.com/relaton/relaton-data-cie/master/".freeze
4
+ INDEX_FILE = "index-v1.yaml".freeze
4
5
 
5
6
  class << self
6
7
  # @param code [String]
7
8
  # @return [RelatonCie::BibliographicItem]
8
9
  def scrape_page(code)
9
- url = "#{ENDPOINT}#{code.gsub(/[\/\s\-:.]/, '_').upcase}.yaml"
10
- parse_page url
10
+ index = Relaton::Index.find_or_create :cie, url: "#{ENDPOINT}index-v1.zip", file: INDEX_FILE
11
+ row = index.search(code).min_by { |r| r[:id] }
12
+ return unless row
13
+
14
+ parse_page "#{ENDPOINT}#{row[:file]}"
11
15
  rescue OpenURI::HTTPError => e
12
16
  return if e.io.status.first == "404"
13
17
 
@@ -1,3 +1,3 @@
1
1
  module RelatonCie
2
- VERSION = "1.14.0".freeze
2
+ VERSION = "1.14.1".freeze
3
3
  end
data/lib/relaton_cie.rb CHANGED
@@ -1,6 +1,9 @@
1
1
  require "nokogiri"
2
2
  require "open-uri"
3
+ # require "parslet"
4
+ require "relaton/index"
3
5
  require "relaton_bib"
6
+ # require "relaton_bib/name_parser"
4
7
  require "relaton_cie/version"
5
8
  require "relaton_cie/bibliographic_item"
6
9
  require "relaton_cie/cie_bibliography"
data/relaton_cie.gemspec CHANGED
@@ -16,7 +16,7 @@ Gem::Specification.new do |spec|
16
16
  DESCRIPTION
17
17
  spec.homepage = "https://github.com/metanorma/relaton-cie"
18
18
  spec.license = "BSD-2-Clause"
19
- spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
19
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.7.0")
20
20
 
21
21
  spec.metadata["homepage_uri"] = spec.homepage
22
22
 
@@ -29,13 +29,8 @@ Gem::Specification.new do |spec|
29
29
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
30
30
  spec.require_paths = ["lib"]
31
31
 
32
- spec.add_development_dependency "equivalent-xml", "~> 0.6"
33
- spec.add_development_dependency "pry-byebug"
34
- spec.add_development_dependency "ruby-jing"
35
- spec.add_development_dependency "simplecov"
36
- spec.add_development_dependency "vcr"
37
- spec.add_development_dependency "webmock"
38
-
39
32
  spec.add_dependency "mechanize", "~> 2.8.0"
33
+ spec.add_dependency "parslet", "~> 2.0.0"
40
34
  spec.add_dependency "relaton-bib", "~> 1.14.0"
35
+ spec.add_dependency "relaton-index", "~> 0.2.0"
41
36
  end
metadata CHANGED
@@ -1,127 +1,71 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-cie
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.14.0
4
+ version: 1.14.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-12-03 00:00:00.000000000 Z
11
+ date: 2023-06-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: equivalent-xml
14
+ name: mechanize
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '0.6'
20
- type: :development
19
+ version: 2.8.0
20
+ type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '0.6'
27
- - !ruby/object:Gem::Dependency
28
- name: pry-byebug
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :development
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: ruby-jing
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
- - !ruby/object:Gem::Dependency
56
- name: simplecov
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - ">="
60
- - !ruby/object:Gem::Version
61
- version: '0'
62
- type: :development
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - ">="
67
- - !ruby/object:Gem::Version
68
- version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: vcr
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: '0'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - ">="
81
- - !ruby/object:Gem::Version
82
- version: '0'
26
+ version: 2.8.0
83
27
  - !ruby/object:Gem::Dependency
84
- name: webmock
28
+ name: parslet
85
29
  requirement: !ruby/object:Gem::Requirement
86
30
  requirements:
87
- - - ">="
31
+ - - "~>"
88
32
  - !ruby/object:Gem::Version
89
- version: '0'
90
- type: :development
33
+ version: 2.0.0
34
+ type: :runtime
91
35
  prerelease: false
92
36
  version_requirements: !ruby/object:Gem::Requirement
93
37
  requirements:
94
- - - ">="
38
+ - - "~>"
95
39
  - !ruby/object:Gem::Version
96
- version: '0'
40
+ version: 2.0.0
97
41
  - !ruby/object:Gem::Dependency
98
- name: mechanize
42
+ name: relaton-bib
99
43
  requirement: !ruby/object:Gem::Requirement
100
44
  requirements:
101
45
  - - "~>"
102
46
  - !ruby/object:Gem::Version
103
- version: 2.8.0
47
+ version: 1.14.0
104
48
  type: :runtime
105
49
  prerelease: false
106
50
  version_requirements: !ruby/object:Gem::Requirement
107
51
  requirements:
108
52
  - - "~>"
109
53
  - !ruby/object:Gem::Version
110
- version: 2.8.0
54
+ version: 1.14.0
111
55
  - !ruby/object:Gem::Dependency
112
- name: relaton-bib
56
+ name: relaton-index
113
57
  requirement: !ruby/object:Gem::Requirement
114
58
  requirements:
115
59
  - - "~>"
116
60
  - !ruby/object:Gem::Version
117
- version: 1.14.0
61
+ version: 0.2.0
118
62
  type: :runtime
119
63
  prerelease: false
120
64
  version_requirements: !ruby/object:Gem::Requirement
121
65
  requirements:
122
66
  - - "~>"
123
67
  - !ruby/object:Gem::Version
124
- version: 1.14.0
68
+ version: 0.2.0
125
69
  description: "RelatonEcma: retrieve CIE Standards for bibliographic use \nusing the
126
70
  BibliographicItem model.\n"
127
71
  email:
@@ -162,7 +106,7 @@ licenses:
162
106
  - BSD-2-Clause
163
107
  metadata:
164
108
  homepage_uri: https://github.com/metanorma/relaton-cie
165
- post_install_message:
109
+ post_install_message:
166
110
  rdoc_options: []
167
111
  require_paths:
168
112
  - lib
@@ -170,15 +114,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
170
114
  requirements:
171
115
  - - ">="
172
116
  - !ruby/object:Gem::Version
173
- version: 2.5.0
117
+ version: 2.7.0
174
118
  required_rubygems_version: !ruby/object:Gem::Requirement
175
119
  requirements:
176
120
  - - ">="
177
121
  - !ruby/object:Gem::Version
178
122
  version: '0'
179
123
  requirements: []
180
- rubygems_version: 3.2.3
181
- signing_key:
124
+ rubygems_version: 3.3.26
125
+ signing_key:
182
126
  specification_version: 4
183
127
  summary: 'RelatonEcma: retrieve CIE Standards for bibliographic use using the BibliographicItem
184
128
  model.'