relaton-cie 1.9.0 → 1.9.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a32430b5610333200d49320b428edb59cdd4d05c12c11aa6be6320c88058fe81
4
- data.tar.gz: 938409879b49ad789a422391356e4548413735dd2a1154c6203425b6e6575eb4
3
+ metadata.gz: 2d5d26de612f4a997641b1f7325f48624e0f9d349d43373236bb10ec27115fb8
4
+ data.tar.gz: 31bec1cd855722cff7d70b57106b78182d98683c8c98984027827cf1f8ae9579
5
5
  SHA512:
6
- metadata.gz: 5e143606dc83ddd5a7c128edb77e0264127accfe466007711865517e6212c9aae9c95892ccc5b170743dfba9d7a37879c504ed6e4e4293aa75498c9cfdae281e
7
- data.tar.gz: 85c9aacec26c847278d060979ba7ac56619817f02065f890783fdcc0147b0684cb2ea905757270470b9048e8dca5f9ddc2b6aa3e4771f130b1c5b358401f21ea
6
+ metadata.gz: 63027b7c118820397e616d083cd6aed0748fc9eb13efa3d12428a7441bddfb9f31b9375825cea31825cccc3ba518c9ffc39c0e4984299cdf49c519c5dc1d979a
7
+ data.tar.gz: e01df911e2fe2d1974bfc5004bbbf49ffb336872c7bead4722997050e38c141ef349d41f5c92f31d7acebdaa4c02dcca1cde012983c79533255687e77f31d298
data/Gemfile CHANGED
@@ -3,5 +3,5 @@ source "https://rubygems.org"
3
3
  # Specify your gem's dependencies in relaton_cie.gemspec
4
4
  gemspec
5
5
 
6
- gem "rake", "~> 12.0"
6
+ gem "rake", "~> 13.0"
7
7
  gem "rspec", "~> 3.0"
data/bin/rspec ADDED
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # This file was generated by Bundler.
6
+ #
7
+ # The application 'rspec' is installed as part of a gem, and
8
+ # this file is here to facilitate running it.
9
+ #
10
+
11
+ require "pathname"
12
+ ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../../Gemfile",
13
+ Pathname.new(__FILE__).realpath)
14
+
15
+ bundle_binstub = File.expand_path("../bundle", __FILE__)
16
+
17
+ if File.file?(bundle_binstub)
18
+ if File.read(bundle_binstub, 300) =~ /This file was generated by Bundler/
19
+ load(bundle_binstub)
20
+ else
21
+ abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
22
+ Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
23
+ end
24
+ end
25
+
26
+ require "rubygems"
27
+ require "bundler/setup"
28
+
29
+ load Gem.bin_path("rspec-core", "rspec")
@@ -0,0 +1,192 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "English"
4
+ require "fileutils"
5
+ require "mechanize"
6
+ require "relaton_bib"
7
+
8
+ module RelatonCie
9
+ class DataFetcher
10
+ URL = "https://www.techstreet.com/cie/searches/31156444?page=1&per_page=100"
11
+
12
+ def initialize(output, format)
13
+ @agent = Mechanize.new
14
+ @output = output
15
+ @format = format
16
+ end
17
+
18
+ # @param hit [Nokogiri::XML::Element]
19
+ # @param doc [Mechanize::Page]
20
+ # @return [Array<RelatonBib::DocumentIdentifier>]
21
+ def fetch_docid(hit, doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
22
+ code = hit.at("h3/a").text.strip.sub(/\u25b9/, "").gsub(" / ", "/")
23
+ c2idx = %r{(?:\(|/)(?<c2>(?:ISO|IEC)\s[^()]+)} =~ code
24
+ code = code[0...c2idx].strip if c2idx
25
+ /^(?<code1>[^(]+)(?:\((?<code2>\w+\d+,(?:\sPages)?[^)]+))?/ =~ code
26
+ if code1.match?(/^CIE/)
27
+ c = code1.size > 25 && code2 ? "CIE #{code2.sub(/,(\sPages)?/, '')}" : code1
28
+ add = doc.at("//hgroup/h2")&.text&.match(/(Add)endum\s(\d+)$/)
29
+ c += " #{add[1]} #{add[2]}" if add
30
+ elsif (pcode = doc.at('//dt[.="Product Code(s):"]/following-sibling::dd'))
31
+ c = "CIE #{pcode.text.strip.match(/[^,]+/)}"
32
+ else
33
+ num = code.match(/(?<=\()\w{2}\d+,.+(?=\))/).to_s.gsub(/,(?=\s)/, "")
34
+ .gsub(/,(?=\S)/, " ")
35
+ c = "CIE #{num}"
36
+ end
37
+ docid = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: c)]
38
+ isbn = doc.at('//dt[contains(.,"ISBN")]/following-sibling::dd')
39
+ docid << RelatonBib::DocumentIdentifier.new(type: c2.match(/\w+/).to_s, id: c2.strip) if c2
40
+ docid << RelatonBib::DocumentIdentifier.new(type: "ISBN", id: isbn.text.strip) if isbn
41
+ docid
42
+ end
43
+
44
+ # @param doc [Mechanize::Page]
45
+ # @return [RelatonBib::TypedTitleStringCollection, Array]
46
+ def fetch_title(doc)
47
+ t = doc.at("//hgroup/h2", "//hgroup/h1")
48
+ return [] unless t
49
+
50
+ RelatonBib::TypedTitleString.from_string t.text.strip
51
+ end
52
+
53
+ # @param doc [Mechanize::Page]
54
+ # @return [Array<RelatonBib::BibliographicDate>]
55
+ def fetch_date(doc)
56
+ doc.xpath('//dt[.="Published:"]/following-sibling::dd[1]').map do |d|
57
+ pd = d.text.strip
58
+ on = pd.match?(/^\d{4}(?:[^-]|$)/) ? pd : Date.strptime(pd, "%m/%d/%Y").strftime("%Y-%m-%d")
59
+ RelatonBib::BibliographicDate.new(type: "published", on: on)
60
+ end
61
+ end
62
+
63
+ # @param doc [Mechanize::Page]
64
+ # @return [String, nil]
65
+ def fetch_edition(doc)
66
+ doc.at('//dt[.="Edition:"]/following-sibling::dd')&.text&.match(/^\d+(?=th)/)&.to_s
67
+ end
68
+
69
+ # @param doc [Mechanize::Page]
70
+ # @return [Array<Hash>]
71
+ def fetch_relation(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
72
+ doc.xpath('//section[@class="history"]/ol/li[not(contains(@class,"selected-product"))]').map do |rel|
73
+ ref = rel.at("a")
74
+ url = "https://www.techstreet.com#{ref[:href]}"
75
+ title = RelatonBib::TypedTitleString.from_string ref.at('p/span[@class="title"]').text
76
+ did = ref.at("h3").text
77
+ docid = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: did)]
78
+ on = ref.at("p/time")
79
+ date = [RelatonBib::BibliographicDate.new(type: "published", on: on[:datetime])]
80
+ link = [RelatonBib::TypedUri.new(type: "src", content: url)]
81
+ bibitem = RelatonBib::BibliographicItem.new docid: docid, title: title, link: link, date: date
82
+ type = ref.at('//li/i[contains(@class,"historical")]') ? "updates" : "updatedBy"
83
+ { type: type, bibitem: bibitem }
84
+ end
85
+ end
86
+
87
+ # @param url [String]
88
+ # @return [Array<RelatonBib::TypedUri>]
89
+ def fetch_link(url)
90
+ [RelatonBib::TypedUri.new(type: "src", content: url)]
91
+ end
92
+
93
+ # @param doc [Mechanize::Page]
94
+ # @return [Array<RelatonBib::FormattedString>]
95
+ def fetch_abstract(doc)
96
+ content = doc.at('//div[contains(@class,"description")]')&.text&.strip
97
+ return [] if content.nil? || content.empty?
98
+
99
+ [RelatonBib::FormattedString.new(content: content, language: "en",
100
+ script: "Latn")]
101
+ end
102
+
103
+ # @param doc [Mechanize::Page]
104
+ # @return [Array<Hash>]
105
+ def fetch_contributor(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity
106
+ authors = doc.xpath('//hgroup/p[not(@class="pub_date")]').text
107
+ contribs = []
108
+ until authors.empty?
109
+ /^(?<sname1>\S+(?:\sder?\s)?[^\s,]+)
110
+ (?:,?\s(?<sname2>[\w-]{2,})(?=,\s+\w\.))?
111
+ (?:,?\s(?<fname>[\w-]{2,})(?!,\s+\w\.))?
112
+ (?:(?:\s?,\s?|\s)(?<init>(?:\w(?:\s?\.|\s|,|$)[\s-]?)+))?
113
+ (?:(?:,\s?|\s|\.|(?<=\s))(?:and\s)?)?/x =~ authors
114
+ raise StandardError, "Author name not found in \"#{authors}\"" unless $LAST_MATCH_INFO
115
+
116
+ authors.sub! $LAST_MATCH_INFO.to_s, ""
117
+ sname = [sname1, sname2].compact.join " "
118
+ surname = RelatonBib::LocalizedString.new sname, "en", "Latn"
119
+ initial = (init&.strip || "").split(/(?:,|\.)(?:-|\s)?/).map do |int|
120
+ RelatonBib::LocalizedString.new(int.strip, "en", "Latn")
121
+ end
122
+ forename = fname ? [RelatonBib::LocalizedString.new(fname, "en", "Latn")] : []
123
+ fullname = RelatonBib::FullName.new surname: surname, forename: forename, initial: initial
124
+ person = RelatonBib::Person.new name: fullname
125
+ contribs << { entity: person, role: [{ type: "author" }] }
126
+ end
127
+ org = RelatonBib::Organization.new(
128
+ name: "Commission Internationale de L'Eclairage", abbreviation: "CIE",
129
+ url: "cie.co.at"
130
+ )
131
+ contribs << { entity: org, role: [{ type: "publisher" }] }
132
+ end
133
+
134
+ # @param bib [RelatonBib::BibliographicItem]
135
+ def write_file(bib)
136
+ id = bib.docidentifier[0].id.gsub(%r{[/\s\-:.]}, "_")
137
+ file = "#{@output}/#{id.upcase}.#{@format}"
138
+ # if File.exist? file
139
+ # warn "File #{file} exists. Docid: #{bib.docidentifier[0].id}"
140
+ # warn "Link: #{bib.link.detect { |l| l.type == 'src' }.content}"
141
+ # else
142
+ out = @format == "xml" ? bib.to_xml(bibdata: true) : bib.to_hash.to_yaml
143
+ File.write file, out, encoding: "UTF-8"
144
+ # end
145
+ end
146
+
147
+ # @param hit [Nokogiri::XML::Element]
148
+ def parse_page(hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
149
+ url = "https://www.techstreet.com#{hit.at('h3/a')[:href]}"
150
+ doc = time_req { @agent.get url }
151
+ item = RelatonBib::BibliographicItem.new(
152
+ type: "standard", docid: fetch_docid(hit, doc), title: fetch_title(doc),
153
+ link: fetch_link(url), abstract: fetch_abstract(doc),
154
+ date: fetch_date(doc), edition: fetch_edition(doc),
155
+ contributor: fetch_contributor(doc), relation: fetch_relation(doc),
156
+ language: ["en"], script: ["Latn"], doctype: "document"
157
+ )
158
+ write_file item
159
+ rescue StandardError => e
160
+ warn "Document: #{url}"
161
+ warn e.message
162
+ warn e.backtrace
163
+ end
164
+
165
+ def fetch(url)
166
+ result = time_req { @agent.get url }
167
+ result.xpath("//li[@data-product]").each { |hit| parse_page hit }
168
+ np = result.at '//a[@class="next_page"]'
169
+ fetch "https://www.techstreet.com#{np[:href]}" if np
170
+ end
171
+
172
+ def time_req
173
+ t1 = Time.now
174
+ result = yield
175
+ t = 1 - (Time.now - t1)
176
+ sleep t if t.positive?
177
+ result
178
+ end
179
+
180
+ def self.fetch(output: "data", format: "yaml")
181
+ t1 = Time.now
182
+ puts "Started at: #{t1}"
183
+
184
+ FileUtils.mkdir output unless Dir.exist? output
185
+ new(output, format).fetch URL
186
+
187
+ t2 = Time.now
188
+ puts "Stopped at: #{t2}"
189
+ puts "Done in: #{(t2 - t1).round} sec."
190
+ end
191
+ end
192
+ end
@@ -2,11 +2,12 @@ require "relaton/processor"
2
2
 
3
3
  module RelatonCie
4
4
  class Processor < Relaton::Processor
5
- def initialize
5
+ def initialize # rubocop:disable Lint/MissingSuper
6
6
  @short = :relaton_cie
7
7
  @prefix = "CIE"
8
8
  @defaultprefix = /^CIE(-|\s)/
9
9
  @idtype = "CIE"
10
+ @datasets = %w[cie-techstreet]
10
11
  end
11
12
 
12
13
  # @param code [String]
@@ -17,6 +18,18 @@ module RelatonCie
17
18
  ::RelatonCie::CieBibliography.get(code, date, opts)
18
19
  end
19
20
 
21
+ #
22
+ # Fetch all the documents from a source
23
+ #
24
+ # @param [String] _source source name
25
+ # @param [Hash] opts
26
+ # @option opts [String] :output directory to output documents
27
+ # @option opts [String] :format
28
+ #
29
+ def fetch_data(_source, opts)
30
+ DataFetcher.fetch(**opts)
31
+ end
32
+
20
33
  # @param xml [String]
21
34
  # @return [RelatonBib::BibliographicItem]
22
35
  def from_xml(xml)
@@ -1,3 +1,3 @@
1
1
  module RelatonCie
2
- VERSION = "1.9.0".freeze
2
+ VERSION = "1.9.1".freeze
3
3
  end
data/lib/relaton_cie.rb CHANGED
@@ -4,6 +4,7 @@ require "relaton_bib"
4
4
  require "relaton_cie/version"
5
5
  require "relaton_cie/cie_bibliography"
6
6
  require "relaton_cie/scrapper"
7
+ require "relaton_cie/data_fetcher"
7
8
 
8
9
  module RelatonCie
9
10
  # Returns hash of XML grammar
data/relaton_cie.gemspec CHANGED
@@ -31,11 +31,11 @@ Gem::Specification.new do |spec|
31
31
 
32
32
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
33
33
  spec.add_development_dependency "pry-byebug"
34
- spec.add_development_dependency "rake", "~> 10.0"
35
34
  spec.add_development_dependency "ruby-jing"
36
35
  spec.add_development_dependency "simplecov"
37
36
  spec.add_development_dependency "vcr"
38
37
  spec.add_development_dependency "webmock"
39
38
 
39
+ spec.add_dependency "mechanize", "~> 2.8.0"
40
40
  spec.add_dependency "relaton-bib", "~> 1.9.0"
41
41
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-cie
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.0
4
+ version: 1.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-08-26 00:00:00.000000000 Z
11
+ date: 2021-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: equivalent-xml
@@ -38,20 +38,6 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: rake
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - "~>"
46
- - !ruby/object:Gem::Version
47
- version: '10.0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - "~>"
53
- - !ruby/object:Gem::Version
54
- version: '10.0'
55
41
  - !ruby/object:Gem::Dependency
56
42
  name: ruby-jing
57
43
  requirement: !ruby/object:Gem::Requirement
@@ -108,6 +94,20 @@ dependencies:
108
94
  - - ">="
109
95
  - !ruby/object:Gem::Version
110
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: mechanize
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: 2.8.0
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: 2.8.0
111
111
  - !ruby/object:Gem::Dependency
112
112
  name: relaton-bib
113
113
  requirement: !ruby/object:Gem::Requirement
@@ -139,6 +139,7 @@ files:
139
139
  - README.adoc
140
140
  - Rakefile
141
141
  - bin/console
142
+ - bin/rspec
142
143
  - bin/setup
143
144
  - grammars/basicdoc.rng
144
145
  - grammars/biblio.rng
@@ -146,6 +147,7 @@ files:
146
147
  - grammars/reqt.rng
147
148
  - lib/relaton_cie.rb
148
149
  - lib/relaton_cie/cie_bibliography.rb
150
+ - lib/relaton_cie/data_fetcher.rb
149
151
  - lib/relaton_cie/processor.rb
150
152
  - lib/relaton_cie/scrapper.rb
151
153
  - lib/relaton_cie/version.rb