relaton-omg 1.18.0 → 1.18.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/relaton_omg/omg_bibliography.rb +1 -1
- data/lib/relaton_omg/scraper.rb +136 -0
- data/lib/relaton_omg/version.rb +1 -1
- data/lib/relaton_omg.rb +1 -1
- metadata +4 -4
- data/lib/relaton_omg/scrapper.rb +0 -121
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3aee35ace5d33ef6a4058a8da3c76d59af173315a7ed32cb5967c0dfb9fd8296
|
4
|
+
data.tar.gz: a1138366be2d18d01354d29db3a8fc0870c4c73cbbbe3ac67ffa3700ae85f9bc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b5194c85224823ac10a951ed66de9c558659751b967d909d53cbbee37ad60eb750f92b60c079fd1f125c324e23dbb651072a40eefd77018d50776331f7228b37
|
7
|
+
data.tar.gz: 1b9ef36597bec1b10285d0e5f3583615aa774f0b901d98d3042477f6c9368c20e9cf60d4f47d00564f0effc99c9d4d5bd236ab0c62e5ee70531151da899cabc3
|
@@ -0,0 +1,136 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
|
3
|
+
module RelatonOmg
|
4
|
+
class Scraper
|
5
|
+
URL_PATTERN = "https://www.omg.org/spec/".freeze
|
6
|
+
|
7
|
+
def initialize(acronym, version = nil, spec = nil)
|
8
|
+
@acronym = acronym
|
9
|
+
@version = version
|
10
|
+
@spec = spec
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.scrape_page(ref)
|
14
|
+
%r{^OMG (?<acronym>[^\s]+)(?:[\s/](?<version>[\d.]+(?:\sbeta(?:\s\d)?)?))?(?:[\s/](?<spec>\w+))?$} =~ ref
|
15
|
+
return unless acronym
|
16
|
+
|
17
|
+
scraper = new(acronym, version, spec)
|
18
|
+
doc = scraper.get_doc
|
19
|
+
return if doc.nil? || scraper.fetch_link.empty?
|
20
|
+
|
21
|
+
OmgBibliographicItem.new(**scraper.item)
|
22
|
+
end
|
23
|
+
|
24
|
+
def get_doc
|
25
|
+
@url = "#{URL_PATTERN}#{@acronym}/"
|
26
|
+
@url += @version.gsub(' ', '/') if @version
|
27
|
+
@doc = Nokogiri::HTML OpenURI.open_uri(@url, open_timeout: 10)
|
28
|
+
rescue OpenURI::HTTPError, URI::InvalidURIError, Net::OpenTimeout => e
|
29
|
+
return if e.is_a?(URI::InvalidURIError) || e.io.status[0] == "404"
|
30
|
+
|
31
|
+
raise RelatonBib::RequestError, "Unable acces #{@url} (#{e.io.status.join(' ')})"
|
32
|
+
end
|
33
|
+
|
34
|
+
def item
|
35
|
+
{
|
36
|
+
id: fetch_id,
|
37
|
+
fetched: Date.today.to_s,
|
38
|
+
docid: fetch_docid,
|
39
|
+
title: fetch_title,
|
40
|
+
abstract: fetch_abstract,
|
41
|
+
version: fetch_version,
|
42
|
+
date: fetch_date,
|
43
|
+
docstatus: fetch_status,
|
44
|
+
link: fetch_link,
|
45
|
+
relation: fetch_relation,
|
46
|
+
keyword: fetch_keyword,
|
47
|
+
license: fetch_license,
|
48
|
+
}
|
49
|
+
end
|
50
|
+
|
51
|
+
def fetch_id
|
52
|
+
"#{@acronym}#{doc_version}#{@spec}"
|
53
|
+
end
|
54
|
+
|
55
|
+
def fetch_title
|
56
|
+
content = @doc.at('//dt[.="Title:"]/following-sibling::dd').text
|
57
|
+
content += ": #{@spec}" if @spec
|
58
|
+
title = RelatonBib::FormattedString.new content: content, language: "en", script: "Latn"
|
59
|
+
[RelatonBib::TypedTitleString.new(type: "main", title: title)]
|
60
|
+
end
|
61
|
+
|
62
|
+
def fetch_docid
|
63
|
+
id = [@acronym]
|
64
|
+
id << doc_version if doc_version
|
65
|
+
id << @spec if @spec
|
66
|
+
[RelatonBib::DocumentIdentifier.new(id: id.join(" "), type: "OMG", primary: true)]
|
67
|
+
end
|
68
|
+
|
69
|
+
def fetch_abstract
|
70
|
+
content = @doc.at('//section[@id="document-metadata"]/div/div/p').text
|
71
|
+
[{ content: content, language: "en", script: "Latn" }]
|
72
|
+
end
|
73
|
+
|
74
|
+
def fetch_version
|
75
|
+
[RelatonBib::BibliographicItem::Version.new(pub_date, doc_version)]
|
76
|
+
end
|
77
|
+
|
78
|
+
def doc_version
|
79
|
+
@doc_version ||= @doc.at('//dt[.="Version:"]/following-sibling::dd/p/span').text
|
80
|
+
end
|
81
|
+
|
82
|
+
def fetch_date
|
83
|
+
[type: "published", on: pub_date.to_s]
|
84
|
+
end
|
85
|
+
|
86
|
+
def pub_date
|
87
|
+
Date.parse @doc.at('//dt[.="Publication Date:"]/following-sibling::dd').text.strip
|
88
|
+
end
|
89
|
+
|
90
|
+
def fetch_status
|
91
|
+
status = @doc.at('//dt[.="Document Status:"]/following-sibling::dd')
|
92
|
+
stage = status.text.strip.match(/\w+/).to_s
|
93
|
+
RelatonBib::DocumentStatus.new(stage: stage)
|
94
|
+
end
|
95
|
+
|
96
|
+
def fetch_link
|
97
|
+
return @link if @link
|
98
|
+
|
99
|
+
@links = []
|
100
|
+
if @spec
|
101
|
+
a = @doc.at("//a[@href='#{@url}/#{@spec}/PDF']")
|
102
|
+
@links << { type: "src", content: a[:href] } if a
|
103
|
+
else
|
104
|
+
a = @doc.at('//dt[.="This Document:"]/following-sibling::dd/a')
|
105
|
+
@links << { type: "src", content: a[:href] } if a
|
106
|
+
pdf = @doc.at('//a[@class="download-document"]')
|
107
|
+
@links << { type: "pdf", content: pdf[:href] } if pdf
|
108
|
+
end
|
109
|
+
@links
|
110
|
+
end
|
111
|
+
|
112
|
+
def fetch_relation
|
113
|
+
v = @doc.xpath('//h2[.="History"]/following-sibling::section/div/table/tbody/tr')
|
114
|
+
v.reduce([]) do |mem, row|
|
115
|
+
ver = row.at("td").text
|
116
|
+
unless ver == doc_version
|
117
|
+
acronym = row.at("td[3]/a")[:href].split("/")[4]
|
118
|
+
fref = RelatonBib::FormattedRef.new content: "OMG #{acronym} #{ver}"
|
119
|
+
bibitem = OmgBibliographicItem.new formattedref: fref
|
120
|
+
mem << { type: "obsoletes", bibitem: bibitem }
|
121
|
+
end
|
122
|
+
mem
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
def fetch_keyword
|
127
|
+
@doc.xpath('//dt[.="Categories:"]/following-sibling::dd/ul/li/a/em').map &:text
|
128
|
+
end
|
129
|
+
|
130
|
+
def fetch_license
|
131
|
+
@doc.xpath(
|
132
|
+
'//dt/span/a[contains(., "IPR Mode")]/../../following-sibling::dd/span',
|
133
|
+
).map { |l| l.text.match(/[\w\s-]+/).to_s.strip }
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
data/lib/relaton_omg/version.rb
CHANGED
data/lib/relaton_omg.rb
CHANGED
@@ -3,7 +3,7 @@ require "relaton_bib"
|
|
3
3
|
require "relaton_omg/version"
|
4
4
|
require "relaton_omg/config"
|
5
5
|
require "relaton_omg/util"
|
6
|
-
require "relaton_omg/scrapper"
|
6
|
+
require "relaton_omg/scraper"
|
7
7
|
require "relaton_omg/omg_bibliography"
|
8
8
|
require "relaton_omg/omg_bibliographic_item"
|
9
9
|
require "relaton_omg/xml_parser"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-omg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.18.0
|
4
|
+
version: 1.18.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-06-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: relaton-bib
|
@@ -56,7 +56,7 @@ files:
|
|
56
56
|
- lib/relaton_omg/omg_bibliographic_item.rb
|
57
57
|
- lib/relaton_omg/omg_bibliography.rb
|
58
58
|
- lib/relaton_omg/processor.rb
|
59
|
-
- lib/relaton_omg/scrapper.rb
|
59
|
+
- lib/relaton_omg/scraper.rb
|
60
60
|
- lib/relaton_omg/util.rb
|
61
61
|
- lib/relaton_omg/version.rb
|
62
62
|
- lib/relaton_omg/xml_parser.rb
|
@@ -82,7 +82,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
82
82
|
- !ruby/object:Gem::Version
|
83
83
|
version: '0'
|
84
84
|
requirements: []
|
85
|
-
rubygems_version: 3.3.
|
85
|
+
rubygems_version: 3.3.27
|
86
86
|
signing_key:
|
87
87
|
specification_version: 4
|
88
88
|
summary: 'RelatonOmg: retrieve OMG Standards for bibliographic using the IsoBibliographicItem
|
data/lib/relaton_omg/scrapper.rb
DELETED
@@ -1,121 +0,0 @@
|
|
1
|
-
require "nokogiri"
|
2
|
-
|
3
|
-
module RelatonOmg
|
4
|
-
module Scrapper
|
5
|
-
URL_PATTERN = "https://www.omg.org/spec/".freeze
|
6
|
-
|
7
|
-
class << self
|
8
|
-
def scrape_page(ref) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
9
|
-
%r{OMG (?<acronym>[^\s]+)\s?(?<version>.*)} =~ ref
|
10
|
-
return unless acronym
|
11
|
-
|
12
|
-
url = URL_PATTERN + acronym
|
13
|
-
url += "/#{version}" if version
|
14
|
-
doc = Nokogiri::HTML OpenURI.open_uri(url, open_timeout: 10)
|
15
|
-
OmgBibliographicItem.new(**item(doc, acronym))
|
16
|
-
rescue OpenURI::HTTPError, URI::InvalidURIError, Net::OpenTimeout => e
|
17
|
-
return if e.is_a?(URI::InvalidURIError) || e.io.status[0] == "404"
|
18
|
-
|
19
|
-
raise RelatonBib::RequestError, "Unable acces #{url} (#{e.io.status.join(' ')})"
|
20
|
-
end
|
21
|
-
|
22
|
-
private
|
23
|
-
|
24
|
-
def item(doc, acronym) # rubocop:disable Metrics/MethodLength
|
25
|
-
{
|
26
|
-
id: fetch_id(doc, acronym),
|
27
|
-
fetched: Date.today.to_s,
|
28
|
-
docid: fetch_docid(doc, acronym),
|
29
|
-
title: fetch_title(doc),
|
30
|
-
abstract: fetch_abstract(doc),
|
31
|
-
version: fetch_version(doc),
|
32
|
-
date: fetch_date(doc),
|
33
|
-
docstatus: fetch_status(doc),
|
34
|
-
link: fetch_link(doc),
|
35
|
-
relation: fetch_relation(doc),
|
36
|
-
keyword: fetch_keyword(doc),
|
37
|
-
license: fetch_license(doc),
|
38
|
-
}
|
39
|
-
end
|
40
|
-
|
41
|
-
def fetch_id(doc, acronym)
|
42
|
-
acronym + version(doc)
|
43
|
-
end
|
44
|
-
|
45
|
-
def fetch_title(doc)
|
46
|
-
content = doc.at('//dt[.="Title:"]/following-sibling::dd').text
|
47
|
-
title = RelatonBib::FormattedString.new content: content, language: "en", script: "Latn"
|
48
|
-
[RelatonBib::TypedTitleString.new(type: "main", title: title)]
|
49
|
-
end
|
50
|
-
|
51
|
-
def fetch_docid(doc, acronym)
|
52
|
-
id = [acronym]
|
53
|
-
if (ver = version(doc))
|
54
|
-
id << ver
|
55
|
-
end
|
56
|
-
[RelatonBib::DocumentIdentifier.new(id: id.join(" "), type: "OMG", primary: true)]
|
57
|
-
end
|
58
|
-
|
59
|
-
def fetch_abstract(doc)
|
60
|
-
content = doc.at('//section[@id="document-metadata"]/div/div/p').text
|
61
|
-
[{ content: content, language: "en", script: "Latn" }]
|
62
|
-
end
|
63
|
-
|
64
|
-
def fetch_version(doc)
|
65
|
-
[RelatonBib::BibliographicItem::Version.new(pub_date(doc), version(doc))]
|
66
|
-
end
|
67
|
-
|
68
|
-
def version(doc)
|
69
|
-
doc.at('//dt[.="Version:"]/following-sibling::dd/p/span').text
|
70
|
-
end
|
71
|
-
|
72
|
-
def fetch_date(doc)
|
73
|
-
[type: "published", on: pub_date(doc).to_s]
|
74
|
-
end
|
75
|
-
|
76
|
-
def pub_date(doc)
|
77
|
-
Date.parse doc.at('//dt[.="Publication Date:"]/following-sibling::dd').text.strip
|
78
|
-
end
|
79
|
-
|
80
|
-
def fetch_status(doc)
|
81
|
-
status = doc.at('//dt[.="Document Status:"]/following-sibling::dd')
|
82
|
-
stage = status.text.strip.match(/\w+/).to_s
|
83
|
-
RelatonBib::DocumentStatus.new(stage: stage)
|
84
|
-
end
|
85
|
-
|
86
|
-
def fetch_link(doc)
|
87
|
-
links = []
|
88
|
-
a = doc.at('//dt[.="This Document:"]/following-sibling::dd/a')
|
89
|
-
links << { type: "src", content: a[:href] } if a
|
90
|
-
pdf = doc.at('//a[@class="download-document"]')
|
91
|
-
links << { type: "pdf", content: pdf[:href] } if pdf
|
92
|
-
links
|
93
|
-
end
|
94
|
-
|
95
|
-
def fetch_relation(doc) # rubocop:disable Metrics/MethodLength
|
96
|
-
current_version = version(doc)
|
97
|
-
v = doc.xpath('//h2[.="History"]/following-sibling::section/div/table/tbody/tr')
|
98
|
-
v.reduce([]) do |mem, row|
|
99
|
-
ver = row.at("td").text
|
100
|
-
unless ver == current_version
|
101
|
-
acronym = row.at("td[3]/a")[:href].split("/")[4]
|
102
|
-
fref = RelatonBib::FormattedRef.new content: "OMG #{acronym} #{ver}"
|
103
|
-
bibitem = OmgBibliographicItem.new formattedref: fref
|
104
|
-
mem << { type: "obsoletes", bibitem: bibitem }
|
105
|
-
end
|
106
|
-
mem
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
def fetch_keyword(doc)
|
111
|
-
doc.xpath('//dt[.="Categories:"]/following-sibling::dd/ul/li/a/em').map &:text
|
112
|
-
end
|
113
|
-
|
114
|
-
def fetch_license(doc)
|
115
|
-
doc.xpath(
|
116
|
-
'//dt/span/a[contains(., "IPR Mode")]/../../following-sibling::dd/span',
|
117
|
-
).map { |l| l.text.match(/[\w\s-]+/).to_s.strip }
|
118
|
-
end
|
119
|
-
end
|
120
|
-
end
|
121
|
-
end
|