relaton-cie 1.9.0 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/bin/rspec +29 -0
- data/lib/relaton_cie/data_fetcher.rb +192 -0
- data/lib/relaton_cie/processor.rb +14 -1
- data/lib/relaton_cie/version.rb +1 -1
- data/lib/relaton_cie.rb +1 -0
- data/relaton_cie.gemspec +1 -1
- metadata +18 -16
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2d5d26de612f4a997641b1f7325f48624e0f9d349d43373236bb10ec27115fb8
|
|
4
|
+
data.tar.gz: 31bec1cd855722cff7d70b57106b78182d98683c8c98984027827cf1f8ae9579
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 63027b7c118820397e616d083cd6aed0748fc9eb13efa3d12428a7441bddfb9f31b9375825cea31825cccc3ba518c9ffc39c0e4984299cdf49c519c5dc1d979a
|
|
7
|
+
data.tar.gz: e01df911e2fe2d1974bfc5004bbbf49ffb336872c7bead4722997050e38c141ef349d41f5c92f31d7acebdaa4c02dcca1cde012983c79533255687e77f31d298
|
data/Gemfile
CHANGED
data/bin/rspec
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
#
|
|
5
|
+
# This file was generated by Bundler.
|
|
6
|
+
#
|
|
7
|
+
# The application 'rspec' is installed as part of a gem, and
|
|
8
|
+
# this file is here to facilitate running it.
|
|
9
|
+
#
|
|
10
|
+
|
|
11
|
+
require "pathname"
|
|
12
|
+
ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../../Gemfile",
|
|
13
|
+
Pathname.new(__FILE__).realpath)
|
|
14
|
+
|
|
15
|
+
bundle_binstub = File.expand_path("../bundle", __FILE__)
|
|
16
|
+
|
|
17
|
+
if File.file?(bundle_binstub)
|
|
18
|
+
if File.read(bundle_binstub, 300) =~ /This file was generated by Bundler/
|
|
19
|
+
load(bundle_binstub)
|
|
20
|
+
else
|
|
21
|
+
abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
|
|
22
|
+
Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
require "rubygems"
|
|
27
|
+
require "bundler/setup"
|
|
28
|
+
|
|
29
|
+
load Gem.bin_path("rspec-core", "rspec")
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "English"
|
|
4
|
+
require "fileutils"
|
|
5
|
+
require "mechanize"
|
|
6
|
+
require "relaton_bib"
|
|
7
|
+
|
|
8
|
+
module RelatonCie
|
|
9
|
+
class DataFetcher
|
|
10
|
+
URL = "https://www.techstreet.com/cie/searches/31156444?page=1&per_page=100"
|
|
11
|
+
|
|
12
|
+
def initialize(output, format)
|
|
13
|
+
@agent = Mechanize.new
|
|
14
|
+
@output = output
|
|
15
|
+
@format = format
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
# @param hit [Nokogiri::XML::Element]
|
|
19
|
+
# @param doc [Mechanize::Page]
|
|
20
|
+
# @return [Array<RelatonBib::DocumentIdentifier>]
|
|
21
|
+
def fetch_docid(hit, doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
|
22
|
+
code = hit.at("h3/a").text.strip.sub(/\u25b9/, "").gsub(" / ", "/")
|
|
23
|
+
c2idx = %r{(?:\(|/)(?<c2>(?:ISO|IEC)\s[^()]+)} =~ code
|
|
24
|
+
code = code[0...c2idx].strip if c2idx
|
|
25
|
+
/^(?<code1>[^(]+)(?:\((?<code2>\w+\d+,(?:\sPages)?[^)]+))?/ =~ code
|
|
26
|
+
if code1.match?(/^CIE/)
|
|
27
|
+
c = code1.size > 25 && code2 ? "CIE #{code2.sub(/,(\sPages)?/, '')}" : code1
|
|
28
|
+
add = doc.at("//hgroup/h2")&.text&.match(/(Add)endum\s(\d+)$/)
|
|
29
|
+
c += " #{add[1]} #{add[2]}" if add
|
|
30
|
+
elsif (pcode = doc.at('//dt[.="Product Code(s):"]/following-sibling::dd'))
|
|
31
|
+
c = "CIE #{pcode.text.strip.match(/[^,]+/)}"
|
|
32
|
+
else
|
|
33
|
+
num = code.match(/(?<=\()\w{2}\d+,.+(?=\))/).to_s.gsub(/,(?=\s)/, "")
|
|
34
|
+
.gsub(/,(?=\S)/, " ")
|
|
35
|
+
c = "CIE #{num}"
|
|
36
|
+
end
|
|
37
|
+
docid = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: c)]
|
|
38
|
+
isbn = doc.at('//dt[contains(.,"ISBN")]/following-sibling::dd')
|
|
39
|
+
docid << RelatonBib::DocumentIdentifier.new(type: c2.match(/\w+/).to_s, id: c2.strip) if c2
|
|
40
|
+
docid << RelatonBib::DocumentIdentifier.new(type: "ISBN", id: isbn.text.strip) if isbn
|
|
41
|
+
docid
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
# @param doc [Mechanize::Page]
|
|
45
|
+
# @return [RelatonBib::TypedTitleStringCollection, Array]
|
|
46
|
+
def fetch_title(doc)
|
|
47
|
+
t = doc.at("//hgroup/h2", "//hgroup/h1")
|
|
48
|
+
return [] unless t
|
|
49
|
+
|
|
50
|
+
RelatonBib::TypedTitleString.from_string t.text.strip
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# @param doc [Mechanize::Page]
|
|
54
|
+
# @return [Array<RelatonBib::BibliographicDate>]
|
|
55
|
+
def fetch_date(doc)
|
|
56
|
+
doc.xpath('//dt[.="Published:"]/following-sibling::dd[1]').map do |d|
|
|
57
|
+
pd = d.text.strip
|
|
58
|
+
on = pd.match?(/^\d{4}(?:[^-]|$)/) ? pd : Date.strptime(pd, "%m/%d/%Y").strftime("%Y-%m-%d")
|
|
59
|
+
RelatonBib::BibliographicDate.new(type: "published", on: on)
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
# @param doc [Mechanize::Page]
|
|
64
|
+
# @return [String, nil]
|
|
65
|
+
def fetch_edition(doc)
|
|
66
|
+
doc.at('//dt[.="Edition:"]/following-sibling::dd')&.text&.match(/^\d+(?=th)/)&.to_s
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
# @param doc [Mechanize::Page]
|
|
70
|
+
# @return [Array<Hash>]
|
|
71
|
+
def fetch_relation(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
|
72
|
+
doc.xpath('//section[@class="history"]/ol/li[not(contains(@class,"selected-product"))]').map do |rel|
|
|
73
|
+
ref = rel.at("a")
|
|
74
|
+
url = "https://www.techstreet.com#{ref[:href]}"
|
|
75
|
+
title = RelatonBib::TypedTitleString.from_string ref.at('p/span[@class="title"]').text
|
|
76
|
+
did = ref.at("h3").text
|
|
77
|
+
docid = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: did)]
|
|
78
|
+
on = ref.at("p/time")
|
|
79
|
+
date = [RelatonBib::BibliographicDate.new(type: "published", on: on[:datetime])]
|
|
80
|
+
link = [RelatonBib::TypedUri.new(type: "src", content: url)]
|
|
81
|
+
bibitem = RelatonBib::BibliographicItem.new docid: docid, title: title, link: link, date: date
|
|
82
|
+
type = ref.at('//li/i[contains(@class,"historical")]') ? "updates" : "updatedBy"
|
|
83
|
+
{ type: type, bibitem: bibitem }
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# @param url [String]
|
|
88
|
+
# @return [Array<RelatonBib::TypedUri>]
|
|
89
|
+
def fetch_link(url)
|
|
90
|
+
[RelatonBib::TypedUri.new(type: "src", content: url)]
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
# @param doc [Mechanize::Page]
|
|
94
|
+
# @return [Array<RelatonBib::FormattedString>]
|
|
95
|
+
def fetch_abstract(doc)
|
|
96
|
+
content = doc.at('//div[contains(@class,"description")]')&.text&.strip
|
|
97
|
+
return [] if content.nil? || content.empty?
|
|
98
|
+
|
|
99
|
+
[RelatonBib::FormattedString.new(content: content, language: "en",
|
|
100
|
+
script: "Latn")]
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
# @param doc [Mechanize::Page]
|
|
104
|
+
# @return [Array<Hash>]
|
|
105
|
+
def fetch_contributor(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity
|
|
106
|
+
authors = doc.xpath('//hgroup/p[not(@class="pub_date")]').text
|
|
107
|
+
contribs = []
|
|
108
|
+
until authors.empty?
|
|
109
|
+
/^(?<sname1>\S+(?:\sder?\s)?[^\s,]+)
|
|
110
|
+
(?:,?\s(?<sname2>[\w-]{2,})(?=,\s+\w\.))?
|
|
111
|
+
(?:,?\s(?<fname>[\w-]{2,})(?!,\s+\w\.))?
|
|
112
|
+
(?:(?:\s?,\s?|\s)(?<init>(?:\w(?:\s?\.|\s|,|$)[\s-]?)+))?
|
|
113
|
+
(?:(?:,\s?|\s|\.|(?<=\s))(?:and\s)?)?/x =~ authors
|
|
114
|
+
raise StandardError, "Author name not found in \"#{authors}\"" unless $LAST_MATCH_INFO
|
|
115
|
+
|
|
116
|
+
authors.sub! $LAST_MATCH_INFO.to_s, ""
|
|
117
|
+
sname = [sname1, sname2].compact.join " "
|
|
118
|
+
surname = RelatonBib::LocalizedString.new sname, "en", "Latn"
|
|
119
|
+
initial = (init&.strip || "").split(/(?:,|\.)(?:-|\s)?/).map do |int|
|
|
120
|
+
RelatonBib::LocalizedString.new(int.strip, "en", "Latn")
|
|
121
|
+
end
|
|
122
|
+
forename = fname ? [RelatonBib::LocalizedString.new(fname, "en", "Latn")] : []
|
|
123
|
+
fullname = RelatonBib::FullName.new surname: surname, forename: forename, initial: initial
|
|
124
|
+
person = RelatonBib::Person.new name: fullname
|
|
125
|
+
contribs << { entity: person, role: [{ type: "author" }] }
|
|
126
|
+
end
|
|
127
|
+
org = RelatonBib::Organization.new(
|
|
128
|
+
name: "Commission Internationale de L'Eclairage", abbreviation: "CIE",
|
|
129
|
+
url: "cie.co.at"
|
|
130
|
+
)
|
|
131
|
+
contribs << { entity: org, role: [{ type: "publisher" }] }
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
# @param bib [RelatonBib::BibliographicItem]
|
|
135
|
+
def write_file(bib)
|
|
136
|
+
id = bib.docidentifier[0].id.gsub(%r{[/\s\-:.]}, "_")
|
|
137
|
+
file = "#{@output}/#{id.upcase}.#{@format}"
|
|
138
|
+
# if File.exist? file
|
|
139
|
+
# warn "File #{file} exists. Docid: #{bib.docidentifier[0].id}"
|
|
140
|
+
# warn "Link: #{bib.link.detect { |l| l.type == 'src' }.content}"
|
|
141
|
+
# else
|
|
142
|
+
out = @format == "xml" ? bib.to_xml(bibdata: true) : bib.to_hash.to_yaml
|
|
143
|
+
File.write file, out, encoding: "UTF-8"
|
|
144
|
+
# end
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
# @param hit [Nokogiri::XML::Element]
|
|
148
|
+
def parse_page(hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
|
149
|
+
url = "https://www.techstreet.com#{hit.at('h3/a')[:href]}"
|
|
150
|
+
doc = time_req { @agent.get url }
|
|
151
|
+
item = RelatonBib::BibliographicItem.new(
|
|
152
|
+
type: "standard", docid: fetch_docid(hit, doc), title: fetch_title(doc),
|
|
153
|
+
link: fetch_link(url), abstract: fetch_abstract(doc),
|
|
154
|
+
date: fetch_date(doc), edition: fetch_edition(doc),
|
|
155
|
+
contributor: fetch_contributor(doc), relation: fetch_relation(doc),
|
|
156
|
+
language: ["en"], script: ["Latn"], doctype: "document"
|
|
157
|
+
)
|
|
158
|
+
write_file item
|
|
159
|
+
rescue StandardError => e
|
|
160
|
+
warn "Document: #{url}"
|
|
161
|
+
warn e.message
|
|
162
|
+
warn e.backtrace
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
def fetch(url)
|
|
166
|
+
result = time_req { @agent.get url }
|
|
167
|
+
result.xpath("//li[@data-product]").each { |hit| parse_page hit }
|
|
168
|
+
np = result.at '//a[@class="next_page"]'
|
|
169
|
+
fetch "https://www.techstreet.com#{np[:href]}" if np
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
def time_req
|
|
173
|
+
t1 = Time.now
|
|
174
|
+
result = yield
|
|
175
|
+
t = 1 - (Time.now - t1)
|
|
176
|
+
sleep t if t.positive?
|
|
177
|
+
result
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
def self.fetch(output: "data", format: "yaml")
|
|
181
|
+
t1 = Time.now
|
|
182
|
+
puts "Started at: #{t1}"
|
|
183
|
+
|
|
184
|
+
FileUtils.mkdir output unless Dir.exist? output
|
|
185
|
+
new(output, format).fetch URL
|
|
186
|
+
|
|
187
|
+
t2 = Time.now
|
|
188
|
+
puts "Stopped at: #{t2}"
|
|
189
|
+
puts "Done in: #{(t2 - t1).round} sec."
|
|
190
|
+
end
|
|
191
|
+
end
|
|
192
|
+
end
|
|
@@ -2,11 +2,12 @@ require "relaton/processor"
|
|
|
2
2
|
|
|
3
3
|
module RelatonCie
|
|
4
4
|
class Processor < Relaton::Processor
|
|
5
|
-
def initialize
|
|
5
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
|
6
6
|
@short = :relaton_cie
|
|
7
7
|
@prefix = "CIE"
|
|
8
8
|
@defaultprefix = /^CIE(-|\s)/
|
|
9
9
|
@idtype = "CIE"
|
|
10
|
+
@datasets = %w[cie-techstreet]
|
|
10
11
|
end
|
|
11
12
|
|
|
12
13
|
# @param code [String]
|
|
@@ -17,6 +18,18 @@ module RelatonCie
|
|
|
17
18
|
::RelatonCie::CieBibliography.get(code, date, opts)
|
|
18
19
|
end
|
|
19
20
|
|
|
21
|
+
#
|
|
22
|
+
# Fetch all the documents from a source
|
|
23
|
+
#
|
|
24
|
+
# @param [String] _source source name
|
|
25
|
+
# @param [Hash] opts
|
|
26
|
+
# @option opts [String] :output directory to output documents
|
|
27
|
+
# @option opts [String] :format
|
|
28
|
+
#
|
|
29
|
+
def fetch_data(_source, opts)
|
|
30
|
+
DataFetcher.fetch(**opts)
|
|
31
|
+
end
|
|
32
|
+
|
|
20
33
|
# @param xml [String]
|
|
21
34
|
# @return [RelatonBib::BibliographicItem]
|
|
22
35
|
def from_xml(xml)
|
data/lib/relaton_cie/version.rb
CHANGED
data/lib/relaton_cie.rb
CHANGED
data/relaton_cie.gemspec
CHANGED
|
@@ -31,11 +31,11 @@ Gem::Specification.new do |spec|
|
|
|
31
31
|
|
|
32
32
|
spec.add_development_dependency "equivalent-xml", "~> 0.6"
|
|
33
33
|
spec.add_development_dependency "pry-byebug"
|
|
34
|
-
spec.add_development_dependency "rake", "~> 10.0"
|
|
35
34
|
spec.add_development_dependency "ruby-jing"
|
|
36
35
|
spec.add_development_dependency "simplecov"
|
|
37
36
|
spec.add_development_dependency "vcr"
|
|
38
37
|
spec.add_development_dependency "webmock"
|
|
39
38
|
|
|
39
|
+
spec.add_dependency "mechanize", "~> 2.8.0"
|
|
40
40
|
spec.add_dependency "relaton-bib", "~> 1.9.0"
|
|
41
41
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: relaton-cie
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.9.0
|
|
4
|
+
version: 1.9.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2021-08
|
|
11
|
+
date: 2021-09-08 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: equivalent-xml
|
|
@@ -38,20 +38,6 @@ dependencies:
|
|
|
38
38
|
- - ">="
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
40
|
version: '0'
|
|
41
|
-
- !ruby/object:Gem::Dependency
|
|
42
|
-
name: rake
|
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
|
44
|
-
requirements:
|
|
45
|
-
- - "~>"
|
|
46
|
-
- !ruby/object:Gem::Version
|
|
47
|
-
version: '10.0'
|
|
48
|
-
type: :development
|
|
49
|
-
prerelease: false
|
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
-
requirements:
|
|
52
|
-
- - "~>"
|
|
53
|
-
- !ruby/object:Gem::Version
|
|
54
|
-
version: '10.0'
|
|
55
41
|
- !ruby/object:Gem::Dependency
|
|
56
42
|
name: ruby-jing
|
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -108,6 +94,20 @@ dependencies:
|
|
|
108
94
|
- - ">="
|
|
109
95
|
- !ruby/object:Gem::Version
|
|
110
96
|
version: '0'
|
|
97
|
+
- !ruby/object:Gem::Dependency
|
|
98
|
+
name: mechanize
|
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
|
100
|
+
requirements:
|
|
101
|
+
- - "~>"
|
|
102
|
+
- !ruby/object:Gem::Version
|
|
103
|
+
version: 2.8.0
|
|
104
|
+
type: :runtime
|
|
105
|
+
prerelease: false
|
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
107
|
+
requirements:
|
|
108
|
+
- - "~>"
|
|
109
|
+
- !ruby/object:Gem::Version
|
|
110
|
+
version: 2.8.0
|
|
111
111
|
- !ruby/object:Gem::Dependency
|
|
112
112
|
name: relaton-bib
|
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -139,6 +139,7 @@ files:
|
|
|
139
139
|
- README.adoc
|
|
140
140
|
- Rakefile
|
|
141
141
|
- bin/console
|
|
142
|
+
- bin/rspec
|
|
142
143
|
- bin/setup
|
|
143
144
|
- grammars/basicdoc.rng
|
|
144
145
|
- grammars/biblio.rng
|
|
@@ -146,6 +147,7 @@ files:
|
|
|
146
147
|
- grammars/reqt.rng
|
|
147
148
|
- lib/relaton_cie.rb
|
|
148
149
|
- lib/relaton_cie/cie_bibliography.rb
|
|
150
|
+
- lib/relaton_cie/data_fetcher.rb
|
|
149
151
|
- lib/relaton_cie/processor.rb
|
|
150
152
|
- lib/relaton_cie/scrapper.rb
|
|
151
153
|
- lib/relaton_cie/version.rb
|