relaton-cie 1.9.0 → 1.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a32430b5610333200d49320b428edb59cdd4d05c12c11aa6be6320c88058fe81
4
- data.tar.gz: 938409879b49ad789a422391356e4548413735dd2a1154c6203425b6e6575eb4
3
+ metadata.gz: 2d5d26de612f4a997641b1f7325f48624e0f9d349d43373236bb10ec27115fb8
4
+ data.tar.gz: 31bec1cd855722cff7d70b57106b78182d98683c8c98984027827cf1f8ae9579
5
5
  SHA512:
6
- metadata.gz: 5e143606dc83ddd5a7c128edb77e0264127accfe466007711865517e6212c9aae9c95892ccc5b170743dfba9d7a37879c504ed6e4e4293aa75498c9cfdae281e
7
- data.tar.gz: 85c9aacec26c847278d060979ba7ac56619817f02065f890783fdcc0147b0684cb2ea905757270470b9048e8dca5f9ddc2b6aa3e4771f130b1c5b358401f21ea
6
+ metadata.gz: 63027b7c118820397e616d083cd6aed0748fc9eb13efa3d12428a7441bddfb9f31b9375825cea31825cccc3ba518c9ffc39c0e4984299cdf49c519c5dc1d979a
7
+ data.tar.gz: e01df911e2fe2d1974bfc5004bbbf49ffb336872c7bead4722997050e38c141ef349d41f5c92f31d7acebdaa4c02dcca1cde012983c79533255687e77f31d298
data/Gemfile CHANGED
@@ -3,5 +3,5 @@ source "https://rubygems.org"
3
3
  # Specify your gem's dependencies in relaton_cie.gemspec
4
4
  gemspec
5
5
 
6
- gem "rake", "~> 12.0"
6
+ gem "rake", "~> 13.0"
7
7
  gem "rspec", "~> 3.0"
data/bin/rspec ADDED
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # This file was generated by Bundler.
6
+ #
7
+ # The application 'rspec' is installed as part of a gem, and
8
+ # this file is here to facilitate running it.
9
+ #
10
+
11
+ require "pathname"
12
+ ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../../Gemfile",
13
+ Pathname.new(__FILE__).realpath)
14
+
15
+ bundle_binstub = File.expand_path("../bundle", __FILE__)
16
+
17
+ if File.file?(bundle_binstub)
18
+ if File.read(bundle_binstub, 300) =~ /This file was generated by Bundler/
19
+ load(bundle_binstub)
20
+ else
21
+ abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
22
+ Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
23
+ end
24
+ end
25
+
26
+ require "rubygems"
27
+ require "bundler/setup"
28
+
29
+ load Gem.bin_path("rspec-core", "rspec")
@@ -0,0 +1,192 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "English"
4
+ require "fileutils"
5
+ require "mechanize"
6
+ require "relaton_bib"
7
+
8
+ module RelatonCie
9
+ class DataFetcher
10
+ URL = "https://www.techstreet.com/cie/searches/31156444?page=1&per_page=100"
11
+
12
+ def initialize(output, format)
13
+ @agent = Mechanize.new
14
+ @output = output
15
+ @format = format
16
+ end
17
+
18
+ # @param hit [Nokogiri::HTML::Document]
19
+ # @param doc [Mechanize::Page]
20
+ # @return [Array<RelatonBib::DocumentIdentifier>]
21
+ def fetch_docid(hit, doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
22
+ code = hit.at("h3/a").text.strip.sub(/\u25b9/, "").gsub(" / ", "/")
23
+ c2idx = %r{(?:\(|/)(?<c2>(?:ISO|IEC)\s[^()]+)} =~ code
24
+ code = code[0...c2idx].strip if c2idx
25
+ /^(?<code1>[^(]+)(?:\((?<code2>\w+\d+,(?:\sPages)?[^)]+))?/ =~ code
26
+ if code1.match?(/^CIE/)
27
+ c = code1.size > 25 && code2 ? "CIE #{code2.sub(/,(\sPages)?/, '')}" : code1
28
+ add = doc.at("//hgroup/h2")&.text&.match(/(Add)endum\s(\d+)$/)
29
+ c += " #{add[1]} #{add[2]}" if add
30
+ elsif (pcode = doc.at('//dt[.="Product Code(s):"]/following-sibling::dd'))
31
+ c = "CIE #{pcode.text.strip.match(/[^,]+/)}"
32
+ else
33
+ num = code.match(/(?<=\()\w{2}\d+,.+(?=\))/).to_s.gsub(/,(?=\s)/, "")
34
+ .gsub(/,(?=\S)/, " ")
35
+ c = "CIE #{num}"
36
+ end
37
+ docid = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: c)]
38
+ isbn = doc.at('//dt[contains(.,"ISBN")]/following-sibling::dd')
39
+ docid << RelatonBib::DocumentIdentifier.new(type: c2.match(/\w+/).to_s, id: c2.strip) if c2
40
+ docid << RelatonBib::DocumentIdentifier.new(type: "ISBN", id: isbn.text.strip) if isbn
41
+ docid
42
+ end
43
+
44
+ # @param doc [Mechanize::Page]
45
+ # @return [RelatonBib::TypedTitleStringCollection, Array]
46
+ def fetch_title(doc)
47
+ t = doc.at("//hgroup/h2", "//hgroup/h1")
48
+ return [] unless t
49
+
50
+ RelatonBib::TypedTitleString.from_string t.text.strip
51
+ end
52
+
53
+ # @param doc [Mechanize::Page]
54
+ # @return [Array<RelatonBib::BibliographicDate>]
55
+ def fetch_date(doc)
56
+ doc.xpath('//dt[.="Published:"]/following-sibling::dd[1]').map do |d|
57
+ pd = d.text.strip
58
+ on = pd.match?(/^\d{4}(?:[^-]|$)/) ? pd : Date.strptime(pd, "%m/%d/%Y").strftime("%Y-%m-%d")
59
+ RelatonBib::BibliographicDate.new(type: "published", on: on)
60
+ end
61
+ end
62
+
63
+ # @param doc [Mechanize::Page]
64
+ # @return [String]
65
+ def fetch_edition(doc)
66
+ doc.at('//dt[.="Edition:"]/following-sibling::dd')&.text&.match(/^\d+(?=th)/)&.to_s
67
+ end
68
+
69
+ # @param doc [Mechanize::Page]
70
+ # @return [Array<Hash>]
71
+ def fetch_relation(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
72
+ doc.xpath('//section[@class="history"]/ol/li[not(contains(@class,"selected-product"))]').map do |rel|
73
+ ref = rel.at("a")
74
+ url = "https://www.techstreet.com#{ref[:href]}"
75
+ title = RelatonBib::TypedTitleString.from_string ref.at('p/span[@class="title"]').text
76
+ did = ref.at("h3").text
77
+ docid = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: did)]
78
+ on = ref.at("p/time")
79
+ date = [RelatonBib::BibliographicDate.new(type: "published", on: on[:datetime])]
80
+ link = [RelatonBib::TypedUri.new(type: "src", content: url)]
81
+ bibitem = RelatonBib::BibliographicItem.new docid: docid, title: title, link: link, date: date
82
+ type = ref.at('//li/i[contains(@class,"historical")]') ? "updates" : "updatedBy"
83
+ { type: type, bibitem: bibitem }
84
+ end
85
+ end
86
+
87
+ # @param url [String]
88
+ # @return [Array<RelatonBib::TypedUri>]
89
+ def fetch_link(url)
90
+ [RelatonBib::TypedUri.new(type: "src", content: url)]
91
+ end
92
+
93
+ # @param doc [Mechanize::Page]
94
+ # @return [Array<RelatonBib::FormattedString>]
95
+ def fetch_abstract(doc)
96
+ content = doc.at('//div[contains(@class,"description")]')&.text&.strip
97
+ return [] if content.nil? || content.empty?
98
+
99
+ [RelatonBib::FormattedString.new(content: content, language: "en",
100
+ script: "Latn")]
101
+ end
102
+
103
+ # @param doc [Mechanize::Page]
104
+ # @return [Array<Hash>]
105
+ def fetch_contributor(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity
106
+ authors = doc.xpath('//hgroup/p[not(@class="pub_date")]').text
107
+ contribs = []
108
+ until authors.empty?
109
+ /^(?<sname1>\S+(?:\sder?\s)?[^\s,]+)
110
+ (?:,?\s(?<sname2>[\w-]{2,})(?=,\s+\w\.))?
111
+ (?:,?\s(?<fname>[\w-]{2,})(?!,\s+\w\.))?
112
+ (?:(?:\s?,\s?|\s)(?<init>(?:\w(?:\s?\.|\s|,|$)[\s-]?)+))?
113
+ (?:(?:,\s?|\s|\.|(?<=\s))(?:and\s)?)?/x =~ authors
114
+ raise StandardError, "Author name not found in \"#{authors}\"" unless $LAST_MATCH_INFO
115
+
116
+ authors.sub! $LAST_MATCH_INFO.to_s, ""
117
+ sname = [sname1, sname2].compact.join " "
118
+ surname = RelatonBib::LocalizedString.new sname, "en", "Latn"
119
+ initial = (init&.strip || "").split(/(?:,|\.)(?:-|\s)?/).map do |int|
120
+ RelatonBib::LocalizedString.new(int.strip, "en", "Latn")
121
+ end
122
+ forename = fname ? [RelatonBib::LocalizedString.new(fname, "en", "Latn")] : []
123
+ fullname = RelatonBib::FullName.new surname: surname, forename: forename, initial: initial
124
+ person = RelatonBib::Person.new name: fullname
125
+ contribs << { entity: person, role: [{ type: "author" }] }
126
+ end
127
+ org = RelatonBib::Organization.new(
128
+ name: "Commission Internationale de L'Eclairage", abbreviation: "CIE",
129
+ url: "cie.co.at"
130
+ )
131
+ contribs << { entity: org, role: [{ type: "publisher" }] }
132
+ end
133
+
134
+ # @param bib [RelatonBib::BibliographicItem]
135
+ def write_file(bib)
136
+ id = bib.docidentifier[0].id.gsub(%r{[/\s\-:.]}, "_")
137
+ file = "#{@output}/#{id.upcase}.#{@format}"
138
+ # if File.exist? file
139
+ # warn "File #{file} exists. Docid: #{bib.docidentifier[0].id}"
140
+ # warn "Link: #{bib.link.detect { |l| l.type == 'src' }.content}"
141
+ # else
142
+ out = @format == "xml" ? bib.to_xml(bibdata: true) : bib.to_hash.to_yaml
143
+ File.write file, out, encoding: "UTF-8"
144
+ # end
145
+ end
146
+
147
+ # @param hit [Nokogiri::HTML::Element]
148
+ def parse_page(hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
149
+ url = "https://www.techstreet.com#{hit.at('h3/a')[:href]}"
150
+ doc = time_req { @agent.get url }
151
+ item = RelatonBib::BibliographicItem.new(
152
+ type: "standard", docid: fetch_docid(hit, doc), title: fetch_title(doc),
153
+ link: fetch_link(url), abstract: fetch_abstract(doc),
154
+ date: fetch_date(doc), edition: fetch_edition(doc),
155
+ contributor: fetch_contributor(doc), relation: fetch_relation(doc),
156
+ language: ["en"], script: ["Latn"], doctype: "document"
157
+ )
158
+ write_file item
159
+ rescue StandardError => e
160
+ warn "Document: #{url}"
161
+ warn e.message
162
+ warn e.backtrace
163
+ end
164
+
165
+ def fetch(url)
166
+ result = time_req { @agent.get url }
167
+ result.xpath("//li[@data-product]").each { |hit| parse_page hit }
168
+ np = result.at '//a[@class="next_page"]'
169
+ fetch "https://www.techstreet.com#{np[:href]}" if np
170
+ end
171
+
172
+ def time_req
173
+ t1 = Time.now
174
+ result = yield
175
+ t = 1 - (Time.now - t1)
176
+ sleep t if t.positive?
177
+ result
178
+ end
179
+
180
+ def self.fetch(output: "data", format: "yaml")
181
+ t1 = Time.now
182
+ puts "Started at: #{t1}"
183
+
184
+ FileUtils.mkdir output unless Dir.exist? output
185
+ new(output, format).fetch URL
186
+
187
+ t2 = Time.now
188
+ puts "Stopped at: #{t2}"
189
+ puts "Done in: #{(t2 - t1).round} sec."
190
+ end
191
+ end
192
+ end
@@ -2,11 +2,12 @@ require "relaton/processor"
2
2
 
3
3
  module RelatonCie
4
4
  class Processor < Relaton::Processor
5
- def initialize
5
+ def initialize # rubocop:disable Lint/MissingSuper
6
6
  @short = :relaton_cie
7
7
  @prefix = "CIE"
8
8
  @defaultprefix = /^CIE(-|\s)/
9
9
  @idtype = "CIE"
10
+ @datasets = %w[cie-techstreet]
10
11
  end
11
12
 
12
13
  # @param code [String]
@@ -17,6 +18,18 @@ module RelatonCie
17
18
  ::RelatonCie::CieBibliography.get(code, date, opts)
18
19
  end
19
20
 
21
+ #
22
+ # Fetch all the documents from a source
23
+ #
24
+ # @param [String] _source source name
25
+ # @param [Hash] opts
26
+ # @option opts [String] :output directory to output documents
27
+ # @option opts [String] :format
28
+ #
29
+ def fetch_data(_source, opts)
30
+ DataFetcher.fetch(**opts)
31
+ end
32
+
20
33
  # @param xml [String]
21
34
  # @return [RelatonBib::BibliographicItem]
22
35
  def from_xml(xml)
@@ -1,3 +1,3 @@
1
1
  module RelatonCie
2
- VERSION = "1.9.0".freeze
2
+ VERSION = "1.9.1".freeze
3
3
  end
data/lib/relaton_cie.rb CHANGED
@@ -4,6 +4,7 @@ require "relaton_bib"
4
4
  require "relaton_cie/version"
5
5
  require "relaton_cie/cie_bibliography"
6
6
  require "relaton_cie/scrapper"
7
+ require "relaton_cie/data_fetcher"
7
8
 
8
9
  module RelatonCie
9
10
  # Returns hash of XML grammar
data/relaton_cie.gemspec CHANGED
@@ -31,11 +31,11 @@ Gem::Specification.new do |spec|
31
31
 
32
32
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
33
33
  spec.add_development_dependency "pry-byebug"
34
- spec.add_development_dependency "rake", "~> 10.0"
35
34
  spec.add_development_dependency "ruby-jing"
36
35
  spec.add_development_dependency "simplecov"
37
36
  spec.add_development_dependency "vcr"
38
37
  spec.add_development_dependency "webmock"
39
38
 
39
+ spec.add_dependency "mechanize", "~> 2.8.0"
40
40
  spec.add_dependency "relaton-bib", "~> 1.9.0"
41
41
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-cie
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.0
4
+ version: 1.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-08-26 00:00:00.000000000 Z
11
+ date: 2021-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: equivalent-xml
@@ -38,20 +38,6 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: rake
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - "~>"
46
- - !ruby/object:Gem::Version
47
- version: '10.0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - "~>"
53
- - !ruby/object:Gem::Version
54
- version: '10.0'
55
41
  - !ruby/object:Gem::Dependency
56
42
  name: ruby-jing
57
43
  requirement: !ruby/object:Gem::Requirement
@@ -108,6 +94,20 @@ dependencies:
108
94
  - - ">="
109
95
  - !ruby/object:Gem::Version
110
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: mechanize
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: 2.8.0
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: 2.8.0
111
111
  - !ruby/object:Gem::Dependency
112
112
  name: relaton-bib
113
113
  requirement: !ruby/object:Gem::Requirement
@@ -139,6 +139,7 @@ files:
139
139
  - README.adoc
140
140
  - Rakefile
141
141
  - bin/console
142
+ - bin/rspec
142
143
  - bin/setup
143
144
  - grammars/basicdoc.rng
144
145
  - grammars/biblio.rng
@@ -146,6 +147,7 @@ files:
146
147
  - grammars/reqt.rng
147
148
  - lib/relaton_cie.rb
148
149
  - lib/relaton_cie/cie_bibliography.rb
150
+ - lib/relaton_cie/data_fetcher.rb
149
151
  - lib/relaton_cie/processor.rb
150
152
  - lib/relaton_cie/scrapper.rb
151
153
  - lib/relaton_cie/version.rb