relaton-omg 1.18.0 → 1.18.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/relaton_omg/omg_bibliography.rb +1 -1
- data/lib/relaton_omg/scraper.rb +136 -0
- data/lib/relaton_omg/version.rb +1 -1
- data/lib/relaton_omg.rb +1 -1
- metadata +4 -4
- data/lib/relaton_omg/scrapper.rb +0 -121
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3aee35ace5d33ef6a4058a8da3c76d59af173315a7ed32cb5967c0dfb9fd8296
|
|
4
|
+
data.tar.gz: a1138366be2d18d01354d29db3a8fc0870c4c73cbbbe3ac67ffa3700ae85f9bc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b5194c85224823ac10a951ed66de9c558659751b967d909d53cbbee37ad60eb750f92b60c079fd1f125c324e23dbb651072a40eefd77018d50776331f7228b37
|
|
7
|
+
data.tar.gz: 1b9ef36597bec1b10285d0e5f3583615aa774f0b901d98d3042477f6c9368c20e9cf60d4f47d00564f0effc99c9d4d5bd236ab0c62e5ee70531151da899cabc3
|
|
data/lib/relaton_omg/scraper.rb
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
require "nokogiri"
|
|
2
|
+
|
|
3
|
+
module RelatonOmg
|
|
4
|
+
class Scraper
|
|
5
|
+
URL_PATTERN = "https://www.omg.org/spec/".freeze
|
|
6
|
+
|
|
7
|
+
def initialize(acronym, version = nil, spec = nil)
|
|
8
|
+
@acronym = acronym
|
|
9
|
+
@version = version
|
|
10
|
+
@spec = spec
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def self.scrape_page(ref)
|
|
14
|
+
%r{^OMG (?<acronym>[^\s]+)(?:[\s/](?<version>[\d.]+(?:\sbeta(?:\s\d)?)?))?(?:[\s/](?<spec>\w+))?$} =~ ref
|
|
15
|
+
return unless acronym
|
|
16
|
+
|
|
17
|
+
scraper = new(acronym, version, spec)
|
|
18
|
+
doc = scraper.get_doc
|
|
19
|
+
return if doc.nil? || scraper.fetch_link.empty?
|
|
20
|
+
|
|
21
|
+
OmgBibliographicItem.new(**scraper.item)
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def get_doc
|
|
25
|
+
@url = "#{URL_PATTERN}#{@acronym}/"
|
|
26
|
+
@url += @version.gsub(' ', '/') if @version
|
|
27
|
+
@doc = Nokogiri::HTML OpenURI.open_uri(@url, open_timeout: 10)
|
|
28
|
+
rescue OpenURI::HTTPError, URI::InvalidURIError, Net::OpenTimeout => e
|
|
29
|
+
return if e.is_a?(URI::InvalidURIError) || e.io.status[0] == "404"
|
|
30
|
+
|
|
31
|
+
raise RelatonBib::RequestError, "Unable acces #{@url} (#{e.io.status.join(' ')})"
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def item
|
|
35
|
+
{
|
|
36
|
+
id: fetch_id,
|
|
37
|
+
fetched: Date.today.to_s,
|
|
38
|
+
docid: fetch_docid,
|
|
39
|
+
title: fetch_title,
|
|
40
|
+
abstract: fetch_abstract,
|
|
41
|
+
version: fetch_version,
|
|
42
|
+
date: fetch_date,
|
|
43
|
+
docstatus: fetch_status,
|
|
44
|
+
link: fetch_link,
|
|
45
|
+
relation: fetch_relation,
|
|
46
|
+
keyword: fetch_keyword,
|
|
47
|
+
license: fetch_license,
|
|
48
|
+
}
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def fetch_id
|
|
52
|
+
"#{@acronym}#{doc_version}#{@spec}"
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def fetch_title
|
|
56
|
+
content = @doc.at('//dt[.="Title:"]/following-sibling::dd').text
|
|
57
|
+
content += ": #{@spec}" if @spec
|
|
58
|
+
title = RelatonBib::FormattedString.new content: content, language: "en", script: "Latn"
|
|
59
|
+
[RelatonBib::TypedTitleString.new(type: "main", title: title)]
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def fetch_docid
|
|
63
|
+
id = [@acronym]
|
|
64
|
+
id << doc_version if doc_version
|
|
65
|
+
id << @spec if @spec
|
|
66
|
+
[RelatonBib::DocumentIdentifier.new(id: id.join(" "), type: "OMG", primary: true)]
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
def fetch_abstract
|
|
70
|
+
content = @doc.at('//section[@id="document-metadata"]/div/div/p').text
|
|
71
|
+
[{ content: content, language: "en", script: "Latn" }]
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
def fetch_version
|
|
75
|
+
[RelatonBib::BibliographicItem::Version.new(pub_date, doc_version)]
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def doc_version
|
|
79
|
+
@doc_version ||= @doc.at('//dt[.="Version:"]/following-sibling::dd/p/span').text
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def fetch_date
|
|
83
|
+
[type: "published", on: pub_date.to_s]
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def pub_date
|
|
87
|
+
Date.parse @doc.at('//dt[.="Publication Date:"]/following-sibling::dd').text.strip
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
def fetch_status
|
|
91
|
+
status = @doc.at('//dt[.="Document Status:"]/following-sibling::dd')
|
|
92
|
+
stage = status.text.strip.match(/\w+/).to_s
|
|
93
|
+
RelatonBib::DocumentStatus.new(stage: stage)
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
def fetch_link
|
|
97
|
+
return @link if @link
|
|
98
|
+
|
|
99
|
+
@links = []
|
|
100
|
+
if @spec
|
|
101
|
+
a = @doc.at("//a[@href='#{@url}/#{@spec}/PDF']")
|
|
102
|
+
@links << { type: "src", content: a[:href] } if a
|
|
103
|
+
else
|
|
104
|
+
a = @doc.at('//dt[.="This Document:"]/following-sibling::dd/a')
|
|
105
|
+
@links << { type: "src", content: a[:href] } if a
|
|
106
|
+
pdf = @doc.at('//a[@class="download-document"]')
|
|
107
|
+
@links << { type: "pdf", content: pdf[:href] } if pdf
|
|
108
|
+
end
|
|
109
|
+
@links
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
def fetch_relation
|
|
113
|
+
v = @doc.xpath('//h2[.="History"]/following-sibling::section/div/table/tbody/tr')
|
|
114
|
+
v.reduce([]) do |mem, row|
|
|
115
|
+
ver = row.at("td").text
|
|
116
|
+
unless ver == doc_version
|
|
117
|
+
acronym = row.at("td[3]/a")[:href].split("/")[4]
|
|
118
|
+
fref = RelatonBib::FormattedRef.new content: "OMG #{acronym} #{ver}"
|
|
119
|
+
bibitem = OmgBibliographicItem.new formattedref: fref
|
|
120
|
+
mem << { type: "obsoletes", bibitem: bibitem }
|
|
121
|
+
end
|
|
122
|
+
mem
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
def fetch_keyword
|
|
127
|
+
@doc.xpath('//dt[.="Categories:"]/following-sibling::dd/ul/li/a/em').map &:text
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
def fetch_license
|
|
131
|
+
@doc.xpath(
|
|
132
|
+
'//dt/span/a[contains(., "IPR Mode")]/../../following-sibling::dd/span',
|
|
133
|
+
).map { |l| l.text.match(/[\w\s-]+/).to_s.strip }
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
end
|
data/lib/relaton_omg/version.rb
CHANGED
data/lib/relaton_omg.rb
CHANGED
|
@@ -3,7 +3,7 @@ require "relaton_bib"
|
|
|
3
3
|
require "relaton_omg/version"
|
|
4
4
|
require "relaton_omg/config"
|
|
5
5
|
require "relaton_omg/util"
|
|
6
|
-
require "relaton_omg/scrapper"
|
|
6
|
+
require "relaton_omg/scraper"
|
|
7
7
|
require "relaton_omg/omg_bibliography"
|
|
8
8
|
require "relaton_omg/omg_bibliographic_item"
|
|
9
9
|
require "relaton_omg/xml_parser"
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: relaton-omg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.18.0
|
|
4
|
+
version: 1.18.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2024-
|
|
11
|
+
date: 2024-06-27 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: relaton-bib
|
|
@@ -56,7 +56,7 @@ files:
|
|
|
56
56
|
- lib/relaton_omg/omg_bibliographic_item.rb
|
|
57
57
|
- lib/relaton_omg/omg_bibliography.rb
|
|
58
58
|
- lib/relaton_omg/processor.rb
|
|
59
|
-
- lib/relaton_omg/scrapper.rb
|
|
59
|
+
- lib/relaton_omg/scraper.rb
|
|
60
60
|
- lib/relaton_omg/util.rb
|
|
61
61
|
- lib/relaton_omg/version.rb
|
|
62
62
|
- lib/relaton_omg/xml_parser.rb
|
|
@@ -82,7 +82,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
82
82
|
- !ruby/object:Gem::Version
|
|
83
83
|
version: '0'
|
|
84
84
|
requirements: []
|
|
85
|
-
rubygems_version: 3.3.
|
|
85
|
+
rubygems_version: 3.3.27
|
|
86
86
|
signing_key:
|
|
87
87
|
specification_version: 4
|
|
88
88
|
summary: 'RelatonOmg: retrieve OMG Standards for bibliographic using the IsoBibliographicItem
|
data/lib/relaton_omg/scrapper.rb
DELETED
|
@@ -1,121 +0,0 @@
|
|
|
1
|
-
require "nokogiri"
|
|
2
|
-
|
|
3
|
-
module RelatonOmg
|
|
4
|
-
module Scrapper
|
|
5
|
-
URL_PATTERN = "https://www.omg.org/spec/".freeze
|
|
6
|
-
|
|
7
|
-
class << self
|
|
8
|
-
def scrape_page(ref) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
|
9
|
-
%r{OMG (?<acronym>[^\s]+)\s?(?<version>.*)} =~ ref
|
|
10
|
-
return unless acronym
|
|
11
|
-
|
|
12
|
-
url = URL_PATTERN + acronym
|
|
13
|
-
url += "/#{version}" if version
|
|
14
|
-
doc = Nokogiri::HTML OpenURI.open_uri(url, open_timeout: 10)
|
|
15
|
-
OmgBibliographicItem.new(**item(doc, acronym))
|
|
16
|
-
rescue OpenURI::HTTPError, URI::InvalidURIError, Net::OpenTimeout => e
|
|
17
|
-
return if e.is_a?(URI::InvalidURIError) || e.io.status[0] == "404"
|
|
18
|
-
|
|
19
|
-
raise RelatonBib::RequestError, "Unable acces #{url} (#{e.io.status.join(' ')})"
|
|
20
|
-
end
|
|
21
|
-
|
|
22
|
-
private
|
|
23
|
-
|
|
24
|
-
def item(doc, acronym) # rubocop:disable Metrics/MethodLength
|
|
25
|
-
{
|
|
26
|
-
id: fetch_id(doc, acronym),
|
|
27
|
-
fetched: Date.today.to_s,
|
|
28
|
-
docid: fetch_docid(doc, acronym),
|
|
29
|
-
title: fetch_title(doc),
|
|
30
|
-
abstract: fetch_abstract(doc),
|
|
31
|
-
version: fetch_version(doc),
|
|
32
|
-
date: fetch_date(doc),
|
|
33
|
-
docstatus: fetch_status(doc),
|
|
34
|
-
link: fetch_link(doc),
|
|
35
|
-
relation: fetch_relation(doc),
|
|
36
|
-
keyword: fetch_keyword(doc),
|
|
37
|
-
license: fetch_license(doc),
|
|
38
|
-
}
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
def fetch_id(doc, acronym)
|
|
42
|
-
acronym + version(doc)
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
def fetch_title(doc)
|
|
46
|
-
content = doc.at('//dt[.="Title:"]/following-sibling::dd').text
|
|
47
|
-
title = RelatonBib::FormattedString.new content: content, language: "en", script: "Latn"
|
|
48
|
-
[RelatonBib::TypedTitleString.new(type: "main", title: title)]
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
def fetch_docid(doc, acronym)
|
|
52
|
-
id = [acronym]
|
|
53
|
-
if (ver = version(doc))
|
|
54
|
-
id << ver
|
|
55
|
-
end
|
|
56
|
-
[RelatonBib::DocumentIdentifier.new(id: id.join(" "), type: "OMG", primary: true)]
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
def fetch_abstract(doc)
|
|
60
|
-
content = doc.at('//section[@id="document-metadata"]/div/div/p').text
|
|
61
|
-
[{ content: content, language: "en", script: "Latn" }]
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
def fetch_version(doc)
|
|
65
|
-
[RelatonBib::BibliographicItem::Version.new(pub_date(doc), version(doc))]
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
def version(doc)
|
|
69
|
-
doc.at('//dt[.="Version:"]/following-sibling::dd/p/span').text
|
|
70
|
-
end
|
|
71
|
-
|
|
72
|
-
def fetch_date(doc)
|
|
73
|
-
[type: "published", on: pub_date(doc).to_s]
|
|
74
|
-
end
|
|
75
|
-
|
|
76
|
-
def pub_date(doc)
|
|
77
|
-
Date.parse doc.at('//dt[.="Publication Date:"]/following-sibling::dd').text.strip
|
|
78
|
-
end
|
|
79
|
-
|
|
80
|
-
def fetch_status(doc)
|
|
81
|
-
status = doc.at('//dt[.="Document Status:"]/following-sibling::dd')
|
|
82
|
-
stage = status.text.strip.match(/\w+/).to_s
|
|
83
|
-
RelatonBib::DocumentStatus.new(stage: stage)
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
def fetch_link(doc)
|
|
87
|
-
links = []
|
|
88
|
-
a = doc.at('//dt[.="This Document:"]/following-sibling::dd/a')
|
|
89
|
-
links << { type: "src", content: a[:href] } if a
|
|
90
|
-
pdf = doc.at('//a[@class="download-document"]')
|
|
91
|
-
links << { type: "pdf", content: pdf[:href] } if pdf
|
|
92
|
-
links
|
|
93
|
-
end
|
|
94
|
-
|
|
95
|
-
def fetch_relation(doc) # rubocop:disable Metrics/MethodLength
|
|
96
|
-
current_version = version(doc)
|
|
97
|
-
v = doc.xpath('//h2[.="History"]/following-sibling::section/div/table/tbody/tr')
|
|
98
|
-
v.reduce([]) do |mem, row|
|
|
99
|
-
ver = row.at("td").text
|
|
100
|
-
unless ver == current_version
|
|
101
|
-
acronym = row.at("td[3]/a")[:href].split("/")[4]
|
|
102
|
-
fref = RelatonBib::FormattedRef.new content: "OMG #{acronym} #{ver}"
|
|
103
|
-
bibitem = OmgBibliographicItem.new formattedref: fref
|
|
104
|
-
mem << { type: "obsoletes", bibitem: bibitem }
|
|
105
|
-
end
|
|
106
|
-
mem
|
|
107
|
-
end
|
|
108
|
-
end
|
|
109
|
-
|
|
110
|
-
def fetch_keyword(doc)
|
|
111
|
-
doc.xpath('//dt[.="Categories:"]/following-sibling::dd/ul/li/a/em').map &:text
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
def fetch_license(doc)
|
|
115
|
-
doc.xpath(
|
|
116
|
-
'//dt/span/a[contains(., "IPR Mode")]/../../following-sibling::dd/span',
|
|
117
|
-
).map { |l| l.text.match(/[\w\s-]+/).to_s.strip }
|
|
118
|
-
end
|
|
119
|
-
end
|
|
120
|
-
end
|
|
121
|
-
end
|