relaton-cie 1.7.pre1 → 1.9.1

Sign up to get free protection for your applications and to get access to all the features.
data/grammars/reqt.rng CHANGED
@@ -30,15 +30,34 @@
30
30
  <data type="boolean"/>
31
31
  </attribute>
32
32
  </optional>
33
+ <optional>
34
+ <attribute name="number"/>
35
+ </optional>
33
36
  <optional>
34
37
  <attribute name="subsequence"/>
35
38
  </optional>
39
+ <optional>
40
+ <attribute name="keep-with-next">
41
+ <data type="boolean"/>
42
+ </attribute>
43
+ </optional>
44
+ <optional>
45
+ <attribute name="keep-lines-together">
46
+ <data type="boolean"/>
47
+ </attribute>
48
+ </optional>
36
49
  <attribute name="id">
37
50
  <data type="ID"/>
38
51
  </attribute>
39
52
  <optional>
40
53
  <attribute name="filename"/>
41
54
  </optional>
55
+ <optional>
56
+ <attribute name="model"/>
57
+ </optional>
58
+ <optional>
59
+ <attribute name="type"/>
60
+ </optional>
42
61
  <optional>
43
62
  <ref name="reqtitle"/>
44
63
  </optional>
@@ -48,9 +67,9 @@
48
67
  <optional>
49
68
  <ref name="subject"/>
50
69
  </optional>
51
- <optional>
70
+ <zeroOrMore>
52
71
  <ref name="reqinherit"/>
53
- </optional>
72
+ </zeroOrMore>
54
73
  <zeroOrMore>
55
74
  <ref name="classification"/>
56
75
  </zeroOrMore>
@@ -135,6 +154,16 @@
135
154
  <data type="boolean"/>
136
155
  </attribute>
137
156
  </optional>
157
+ <optional>
158
+ <attribute name="keep-with-next">
159
+ <data type="boolean"/>
160
+ </attribute>
161
+ </optional>
162
+ <optional>
163
+ <attribute name="keep-lines-together">
164
+ <data type="boolean"/>
165
+ </attribute>
166
+ </optional>
138
167
  <oneOrMore>
139
168
  <ref name="BasicBlock"/>
140
169
  </oneOrMore>
@@ -0,0 +1,192 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "English"
4
+ require "fileutils"
5
+ require "mechanize"
6
+ require "relaton_bib"
7
+
8
+ module RelatonCie
9
+ class DataFetcher
10
+ URL = "https://www.techstreet.com/cie/searches/31156444?page=1&per_page=100"
11
+
12
+ def initialize(output, format)
13
+ @agent = Mechanize.new
14
+ @output = output
15
+ @format = format
16
+ end
17
+
18
+ # @param hit [Nokogiri::XML::Element]
19
+ # @param doc [Mechanize::Page]
20
+ # @return [Array<RelatonBib::DocumentIdentifier>]
21
+ def fetch_docid(hit, doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
22
+ code = hit.at("h3/a").text.strip.sub(/\u25b9/, "").gsub(" / ", "/")
23
+ c2idx = %r{(?:\(|/)(?<c2>(?:ISO|IEC)\s[^()]+)} =~ code
24
+ code = code[0...c2idx].strip if c2idx
25
+ /^(?<code1>[^(]+)(?:\((?<code2>\w+\d+,(?:\sPages)?[^)]+))?/ =~ code
26
+ if code1.match?(/^CIE/)
27
+ c = code1.size > 25 && code2 ? "CIE #{code2.sub(/,(\sPages)?/, '')}" : code1
28
+ add = doc.at("//hgroup/h2")&.text&.match(/(Add)endum\s(\d+)$/)
29
+ c += " #{add[1]} #{add[2]}" if add
30
+ elsif (pcode = doc.at('//dt[.="Product Code(s):"]/following-sibling::dd'))
31
+ c = "CIE #{pcode.text.strip.match(/[^,]+/)}"
32
+ else
33
+ num = code.match(/(?<=\()\w{2}\d+,.+(?=\))/).to_s.gsub(/,(?=\s)/, "")
34
+ .gsub(/,(?=\S)/, " ")
35
+ c = "CIE #{num}"
36
+ end
37
+ docid = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: c)]
38
+ isbn = doc.at('//dt[contains(.,"ISBN")]/following-sibling::dd')
39
+ docid << RelatonBib::DocumentIdentifier.new(type: c2.match(/\w+/).to_s, id: c2.strip) if c2
40
+ docid << RelatonBib::DocumentIdentifier.new(type: "ISBN", id: isbn.text.strip) if isbn
41
+ docid
42
+ end
43
+
44
+ # @param doc [Mechanize::Page]
45
+ # @return [RelatonBib::TypedTitleStringCollection, Array]
46
+ def fetch_title(doc)
47
+ t = doc.at("//hgroup/h2", "//hgroup/h1")
48
+ return [] unless t
49
+
50
+ RelatonBib::TypedTitleString.from_string t.text.strip
51
+ end
52
+
53
+ # @param doc [Mechanize::Page]
54
+ # @return [Array<RelatonBib::BibliographicDate>]
55
+ def fetch_date(doc)
56
+ doc.xpath('//dt[.="Published:"]/following-sibling::dd[1]').map do |d|
57
+ pd = d.text.strip
58
+ on = pd.match?(/^\d{4}(?:[^-]|$)/) ? pd : Date.strptime(pd, "%m/%d/%Y").strftime("%Y-%m-%d")
59
+ RelatonBib::BibliographicDate.new(type: "published", on: on)
60
+ end
61
+ end
62
+
63
+ # @param doc [Mechanize::Page]
64
+ # @return [String, nil]
65
+ def fetch_edition(doc)
66
+ doc.at('//dt[.="Edition:"]/following-sibling::dd')&.text&.match(/^\d+(?=th)/)&.to_s
67
+ end
68
+
69
+ # @param doc [Mechanize::Page]
70
+ # @return [Array<Hash>]
71
+ def fetch_relation(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
72
+ doc.xpath('//section[@class="history"]/ol/li[not(contains(@class,"selected-product"))]').map do |rel|
73
+ ref = rel.at("a")
74
+ url = "https://www.techstreet.com#{ref[:href]}"
75
+ title = RelatonBib::TypedTitleString.from_string ref.at('p/span[@class="title"]').text
76
+ did = ref.at("h3").text
77
+ docid = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: did)]
78
+ on = ref.at("p/time")
79
+ date = [RelatonBib::BibliographicDate.new(type: "published", on: on[:datetime])]
80
+ link = [RelatonBib::TypedUri.new(type: "src", content: url)]
81
+ bibitem = RelatonBib::BibliographicItem.new docid: docid, title: title, link: link, date: date
82
+ type = ref.at('//li/i[contains(@class,"historical")]') ? "updates" : "updatedBy"
83
+ { type: type, bibitem: bibitem }
84
+ end
85
+ end
86
+
87
+ # @param url [String]
88
+ # @return [Array<RelatonBib::TypedUri>]
89
+ def fetch_link(url)
90
+ [RelatonBib::TypedUri.new(type: "src", content: url)]
91
+ end
92
+
93
+ # @param doc [Mechanize::Page]
94
+ # @return [Array<RelatonBib::FormattedString>]
95
+ def fetch_abstract(doc)
96
+ content = doc.at('//div[contains(@class,"description")]')&.text&.strip
97
+ return [] if content.nil? || content.empty?
98
+
99
+ [RelatonBib::FormattedString.new(content: content, language: "en",
100
+ script: "Latn")]
101
+ end
102
+
103
+ # @param doc [Mechanize::Page]
104
+ # @return [Array<Hash>]
105
+ def fetch_contributor(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity
106
+ authors = doc.xpath('//hgroup/p[not(@class="pub_date")]').text
107
+ contribs = []
108
+ until authors.empty?
109
+ /^(?<sname1>\S+(?:\sder?\s)?[^\s,]+)
110
+ (?:,?\s(?<sname2>[\w-]{2,})(?=,\s+\w\.))?
111
+ (?:,?\s(?<fname>[\w-]{2,})(?!,\s+\w\.))?
112
+ (?:(?:\s?,\s?|\s)(?<init>(?:\w(?:\s?\.|\s|,|$)[\s-]?)+))?
113
+ (?:(?:,\s?|\s|\.|(?<=\s))(?:and\s)?)?/x =~ authors
114
+ raise StandardError, "Author name not found in \"#{authors}\"" unless $LAST_MATCH_INFO
115
+
116
+ authors.sub! $LAST_MATCH_INFO.to_s, ""
117
+ sname = [sname1, sname2].compact.join " "
118
+ surname = RelatonBib::LocalizedString.new sname, "en", "Latn"
119
+ initial = (init&.strip || "").split(/(?:,|\.)(?:-|\s)?/).map do |int|
120
+ RelatonBib::LocalizedString.new(int.strip, "en", "Latn")
121
+ end
122
+ forename = fname ? [RelatonBib::LocalizedString.new(fname, "en", "Latn")] : []
123
+ fullname = RelatonBib::FullName.new surname: surname, forename: forename, initial: initial
124
+ person = RelatonBib::Person.new name: fullname
125
+ contribs << { entity: person, role: [{ type: "author" }] }
126
+ end
127
+ org = RelatonBib::Organization.new(
128
+ name: "Commission Internationale de L'Eclairage", abbreviation: "CIE",
129
+ url: "cie.co.at"
130
+ )
131
+ contribs << { entity: org, role: [{ type: "publisher" }] }
132
+ end
133
+
134
+ # @param bib [RelatonBib::BibliographicItem]
135
+ def write_file(bib)
136
+ id = bib.docidentifier[0].id.gsub(%r{[/\s\-:.]}, "_")
137
+ file = "#{@output}/#{id.upcase}.#{@format}"
138
+ # if File.exist? file
139
+ # warn "File #{file} exists. Docid: #{bib.docidentifier[0].id}"
140
+ # warn "Link: #{bib.link.detect { |l| l.type == 'src' }.content}"
141
+ # else
142
+ out = @format == "xml" ? bib.to_xml(bibdata: true) : bib.to_hash.to_yaml
143
+ File.write file, out, encoding: "UTF-8"
144
+ # end
145
+ end
146
+
147
+ # @param hit [Nokogiri::XML::Element]
148
+ def parse_page(hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
149
+ url = "https://www.techstreet.com#{hit.at('h3/a')[:href]}"
150
+ doc = time_req { @agent.get url }
151
+ item = RelatonBib::BibliographicItem.new(
152
+ type: "standard", docid: fetch_docid(hit, doc), title: fetch_title(doc),
153
+ link: fetch_link(url), abstract: fetch_abstract(doc),
154
+ date: fetch_date(doc), edition: fetch_edition(doc),
155
+ contributor: fetch_contributor(doc), relation: fetch_relation(doc),
156
+ language: ["en"], script: ["Latn"], doctype: "document"
157
+ )
158
+ write_file item
159
+ rescue StandardError => e
160
+ warn "Document: #{url}"
161
+ warn e.message
162
+ warn e.backtrace
163
+ end
164
+
165
+ def fetch(url)
166
+ result = time_req { @agent.get url }
167
+ result.xpath("//li[@data-product]").each { |hit| parse_page hit }
168
+ np = result.at '//a[@class="next_page"]'
169
+ fetch "https://www.techstreet.com#{np[:href]}" if np
170
+ end
171
+
172
+ def time_req
173
+ t1 = Time.now
174
+ result = yield
175
+ t = 1 - (Time.now - t1)
176
+ sleep t if t.positive?
177
+ result
178
+ end
179
+
180
+ def self.fetch(output: "data", format: "yaml")
181
+ t1 = Time.now
182
+ puts "Started at: #{t1}"
183
+
184
+ FileUtils.mkdir output unless Dir.exist? output
185
+ new(output, format).fetch URL
186
+
187
+ t2 = Time.now
188
+ puts "Stopped at: #{t2}"
189
+ puts "Done in: #{(t2 - t1).round} sec."
190
+ end
191
+ end
192
+ end
@@ -2,11 +2,12 @@ require "relaton/processor"
2
2
 
3
3
  module RelatonCie
4
4
  class Processor < Relaton::Processor
5
- def initialize
5
+ def initialize # rubocop:disable Lint/MissingSuper
6
6
  @short = :relaton_cie
7
7
  @prefix = "CIE"
8
8
  @defaultprefix = /^CIE(-|\s)/
9
9
  @idtype = "CIE"
10
+ @datasets = %w[cie-techstreet]
10
11
  end
11
12
 
12
13
  # @param code [String]
@@ -17,6 +18,18 @@ module RelatonCie
17
18
  ::RelatonCie::CieBibliography.get(code, date, opts)
18
19
  end
19
20
 
21
+ #
22
+ # Fetch all the documents from a source
23
+ #
24
+ # @param [String] _source source name
25
+ # @param [Hash] opts
26
+ # @option opts [String] :output directory to output documents
27
+ # @option opts [String] :format
28
+ #
29
+ def fetch_data(_source, opts)
30
+ DataFetcher.fetch(**opts)
31
+ end
32
+
20
33
  # @param xml [String]
21
34
  # @return [RelatonBib::BibliographicItem]
22
35
  def from_xml(xml)
@@ -26,8 +39,7 @@ module RelatonCie
26
39
  # @param hash [Hash]
27
40
  # @return [RelatonBib::BibliographicItem]
28
41
  def hash_to_bib(hash)
29
- item_hash = ::RelatonBib::HashConverter.hash_to_bib(hash)
30
- ::RelatonBib::BibliographicItem.new item_hash
42
+ ::RelatonBib::BibliographicItem.from_hash hash
31
43
  end
32
44
 
33
45
  # Returns hash of XML grammar
@@ -6,7 +6,7 @@ module RelatonCie
6
6
  # @param code [String]
7
7
  # @return [RelatonBib::BibliographicItem]
8
8
  def scrape_page(code)
9
- url = "#{ENDPOINT}#{code.gsub(/[\/\s\-:\.]/, '_').upcase}.yaml"
9
+ url = "#{ENDPOINT}#{code.gsub(/[\/\s\-:.]/, '_').upcase}.yaml"
10
10
  parse_page url
11
11
  rescue OpenURI::HTTPError => e
12
12
  return if e.io.status.first == "404"
@@ -21,7 +21,7 @@ module RelatonCie
21
21
  def parse_page(url)
22
22
  doc = OpenURI.open_uri url
23
23
  bib_hash = RelatonBib::HashConverter.hash_to_bib YAML.safe_load(doc)
24
- RelatonBib::BibliographicItem.new **bib_hash
24
+ RelatonBib::BibliographicItem.new(**bib_hash)
25
25
  end
26
26
  end
27
27
  end
@@ -1,3 +1,3 @@
1
1
  module RelatonCie
2
- VERSION = "1.7.pre1".freeze
2
+ VERSION = "1.9.1".freeze
3
3
  end
data/lib/relaton_cie.rb CHANGED
@@ -4,6 +4,7 @@ require "relaton_bib"
4
4
  require "relaton_cie/version"
5
5
  require "relaton_cie/cie_bibliography"
6
6
  require "relaton_cie/scrapper"
7
+ require "relaton_cie/data_fetcher"
7
8
 
8
9
  module RelatonCie
9
10
  # Returns hash of XML grammar
data/relaton_cie.gemspec CHANGED
@@ -16,7 +16,7 @@ Gem::Specification.new do |spec|
16
16
  DESCRIPTION
17
17
  spec.homepage = "https://github.com/metanorma/relaton-cie"
18
18
  spec.license = "BSD-2-Clause"
19
- spec.required_ruby_version = Gem::Requirement.new(">= 2.4.0")
19
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
20
20
 
21
21
  spec.metadata["homepage_uri"] = spec.homepage
22
22
 
@@ -29,15 +29,13 @@ Gem::Specification.new do |spec|
29
29
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
30
30
  spec.require_paths = ["lib"]
31
31
 
32
- # spec.add_development_dependency "debase"
33
32
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
34
33
  spec.add_development_dependency "pry-byebug"
35
- spec.add_development_dependency "rake", "~> 10.0"
36
- # spec.add_development_dependency "ruby-debug-ide"
37
34
  spec.add_development_dependency "ruby-jing"
38
35
  spec.add_development_dependency "simplecov"
39
36
  spec.add_development_dependency "vcr"
40
37
  spec.add_development_dependency "webmock"
41
38
 
42
- spec.add_dependency "relaton-bib", "~> 1.7.0"
39
+ spec.add_dependency "mechanize", "~> 2.8.0"
40
+ spec.add_dependency "relaton-bib", "~> 1.9.0"
43
41
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-cie
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.7.pre1
4
+ version: 1.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-03-17 00:00:00.000000000 Z
11
+ date: 2021-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: equivalent-xml
@@ -38,20 +38,6 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: rake
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - "~>"
46
- - !ruby/object:Gem::Version
47
- version: '10.0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - "~>"
53
- - !ruby/object:Gem::Version
54
- version: '10.0'
55
41
  - !ruby/object:Gem::Dependency
56
42
  name: ruby-jing
57
43
  requirement: !ruby/object:Gem::Requirement
@@ -108,20 +94,34 @@ dependencies:
108
94
  - - ">="
109
95
  - !ruby/object:Gem::Version
110
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: mechanize
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: 2.8.0
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: 2.8.0
111
111
  - !ruby/object:Gem::Dependency
112
112
  name: relaton-bib
113
113
  requirement: !ruby/object:Gem::Requirement
114
114
  requirements:
115
115
  - - "~>"
116
116
  - !ruby/object:Gem::Version
117
- version: 1.7.0
117
+ version: 1.9.0
118
118
  type: :runtime
119
119
  prerelease: false
120
120
  version_requirements: !ruby/object:Gem::Requirement
121
121
  requirements:
122
122
  - - "~>"
123
123
  - !ruby/object:Gem::Version
124
- version: 1.7.0
124
+ version: 1.9.0
125
125
  description: "RelatonEcma: retrieve CIE Standards for bibliographic use \nusing the
126
126
  BibliographicItem model.\n"
127
127
  email:
@@ -139,6 +139,7 @@ files:
139
139
  - README.adoc
140
140
  - Rakefile
141
141
  - bin/console
142
+ - bin/rspec
142
143
  - bin/setup
143
144
  - grammars/basicdoc.rng
144
145
  - grammars/biblio.rng
@@ -146,6 +147,7 @@ files:
146
147
  - grammars/reqt.rng
147
148
  - lib/relaton_cie.rb
148
149
  - lib/relaton_cie/cie_bibliography.rb
150
+ - lib/relaton_cie/data_fetcher.rb
149
151
  - lib/relaton_cie/processor.rb
150
152
  - lib/relaton_cie/scrapper.rb
151
153
  - lib/relaton_cie/version.rb
@@ -163,14 +165,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
163
165
  requirements:
164
166
  - - ">="
165
167
  - !ruby/object:Gem::Version
166
- version: 2.4.0
168
+ version: 2.5.0
167
169
  required_rubygems_version: !ruby/object:Gem::Requirement
168
170
  requirements:
169
- - - ">"
171
+ - - ">="
170
172
  - !ruby/object:Gem::Version
171
- version: 1.3.1
173
+ version: '0'
172
174
  requirements: []
173
- rubygems_version: 3.0.6
175
+ rubygems_version: 3.2.3
174
176
  signing_key:
175
177
  specification_version: 4
176
178
  summary: 'RelatonEcma: retrieve CIE Standards for bibliographic use using the BibliographicItem