relaton-nist 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/Gemfile.lock +5 -3
- data/lib/relaton_nist.rb +16 -1
- data/lib/relaton_nist/data/pubs-export.zip +0 -0
- data/lib/relaton_nist/hit_collection.rb +97 -30
- data/lib/relaton_nist/nist_bibliography.rb +12 -17
- data/lib/relaton_nist/scrapper.rb +246 -107
- data/lib/relaton_nist/version.rb +1 -1
- data/relaton_nist.gemspec +1 -0
- metadata +18 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 42ae6073ff5e1cbaba11be06d27c9e0da310bcf4
|
4
|
+
data.tar.gz: fd4e493fc0f7f3edefe2af58f26a76954c2b1e2c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 51ca4ccf407bb4355f669fc82ea2950a720f75af506174a58b807c86e109b21808fc4f380363258941c37328e544f9658c75d9ee24966124f3532f0abf8df421
|
7
|
+
data.tar.gz: e23643ecd8e7a685f2542660bf064ef30ff027460b9898aa6a1435f21f7a2ec4b311bdaf7306ed078e920b5b728d7b46402d368d9d6535bc968474793e30ad1a
|
data/Gemfile.lock
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
relaton-nist (0.2.1)
|
4
|
+
relaton-nist (0.2.2)
|
5
5
|
relaton-bib (~> 0.2.0)
|
6
|
+
rubyzip
|
6
7
|
|
7
8
|
GEM
|
8
9
|
remote: https://rubygems.org/
|
@@ -32,9 +33,9 @@ GEM
|
|
32
33
|
pry-byebug (3.7.0)
|
33
34
|
byebug (~> 11.0)
|
34
35
|
pry (~> 0.10)
|
35
|
-
public_suffix (3.1.
|
36
|
+
public_suffix (3.1.1)
|
36
37
|
rake (10.5.0)
|
37
|
-
relaton-bib (0.2.
|
38
|
+
relaton-bib (0.2.3)
|
38
39
|
addressable
|
39
40
|
nokogiri (~> 1.8.4)
|
40
41
|
rspec (3.8.0)
|
@@ -52,6 +53,7 @@ GEM
|
|
52
53
|
rspec-support (3.8.2)
|
53
54
|
ruby-debug-ide (0.7.0)
|
54
55
|
rake (>= 0.8.1)
|
56
|
+
rubyzip (1.2.3)
|
55
57
|
safe_yaml (1.0.5)
|
56
58
|
simplecov (0.16.1)
|
57
59
|
docile (~> 1.1)
|
data/lib/relaton_nist.rb
CHANGED
@@ -8,5 +8,20 @@ end
|
|
8
8
|
|
9
9
|
module RelatonNist
|
10
10
|
class Error < StandardError; end
|
11
|
-
|
11
|
+
|
12
|
+
class << self
|
13
|
+
# @param sdate [String]
|
14
|
+
# @return [Date, NilClass]
|
15
|
+
def parse_date(sdate)
|
16
|
+
if /(?<date>\w+\s\d{4})/ =~ sdate # February 2012
|
17
|
+
Date.strptime(date, "%B %Y")
|
18
|
+
elsif /(?<date>\w+\s\d{1,2},\s\d{4})/ =~ sdate # February 11, 2012
|
19
|
+
Date.strptime(date, "%B %d, %Y")
|
20
|
+
elsif /(?<date>\d{4}-\d{2}-\d{2})/ =~ sdate # 2012-02-11
|
21
|
+
Date.parse(date)
|
22
|
+
elsif /(?<date>\d{4}-\d{2})/ =~ sdate # 2012-02
|
23
|
+
Date.strptime date, "%Y-%m"
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
12
27
|
end
|
Binary file
|
@@ -1,5 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require "zip"
|
4
|
+
require "fileutils"
|
3
5
|
require "relaton_nist/hit"
|
4
6
|
require "addressable/uri"
|
5
7
|
require "open-uri"
|
@@ -7,8 +9,8 @@ require "open-uri"
|
|
7
9
|
module RelatonNist
|
8
10
|
# Page of hit collection.
|
9
11
|
class HitCollection < Array
|
10
|
-
|
11
12
|
DOMAIN = "https://csrc.nist.gov"
|
13
|
+
DATAFILE = File.expand_path "data/pubs-export.zip", __dir__
|
12
14
|
|
13
15
|
# @return [TrueClass, FalseClass]
|
14
16
|
attr_reader :fetched
|
@@ -28,13 +30,58 @@ module RelatonNist
|
|
28
30
|
def initialize(ref_nbr, year = nil, opts = {})
|
29
31
|
@text = ref_nbr
|
30
32
|
@year = year
|
33
|
+
|
34
|
+
/(?<docid>(SP|FIPS)\s[0-9-]+)/ =~ text
|
35
|
+
hits = docid ? from_json(docid, **opts) : from_csrc(**opts)
|
36
|
+
|
37
|
+
hits.sort! do |a, b|
|
38
|
+
if a.sort_value != b.sort_value
|
39
|
+
b.sort_value - a.sort_value
|
40
|
+
else
|
41
|
+
(b.hit[:release_date] - a.hit[:release_date]).to_i
|
42
|
+
end
|
43
|
+
end
|
44
|
+
concat hits
|
45
|
+
@fetched = false
|
46
|
+
end
|
47
|
+
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
48
|
+
|
49
|
+
# @return [RelatonNist::HitCollection]
|
50
|
+
def fetch
|
51
|
+
workers = RelatonBib::WorkersPool.new 4
|
52
|
+
workers.worker(&:fetch)
|
53
|
+
each do |hit|
|
54
|
+
workers << hit
|
55
|
+
end
|
56
|
+
workers.end
|
57
|
+
workers.result
|
58
|
+
@fetched = true
|
59
|
+
self
|
60
|
+
end
|
61
|
+
|
62
|
+
def to_s
|
63
|
+
inspect
|
64
|
+
end
|
65
|
+
|
66
|
+
# @return [String]
|
67
|
+
def inspect
|
68
|
+
"<#{self.class}:#{format('%#.14x', object_id << 1)} @fetched=#{@fetched}>"
|
69
|
+
end
|
70
|
+
|
71
|
+
private
|
72
|
+
|
73
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
74
|
+
|
75
|
+
# @param stage [String]
|
76
|
+
# @return [Array<RelatonNist::Hit>]
|
77
|
+
def from_csrc(**opts)
|
31
78
|
from, to = nil
|
32
79
|
if year
|
33
|
-
d
|
80
|
+
d = Date.strptime year, "%Y"
|
34
81
|
from = d.strftime "%m/%d/%Y"
|
35
82
|
to = d.next_year.prev_day.strftime "%m/%d/%Y"
|
36
83
|
end
|
37
|
-
url = "#{DOMAIN}/publications/search?keywords-lg=#{
|
84
|
+
url = "#{DOMAIN}/publications/search?keywords-lg=#{text}"
|
38
85
|
url += "&dateFrom-lg=#{from}" if from
|
39
86
|
url += "&dateTo-lg=#{to}" if to
|
40
87
|
url += if /PD/ =~ opts[:stage]
|
@@ -44,7 +91,7 @@ module RelatonNist
|
|
44
91
|
end
|
45
92
|
|
46
93
|
doc = Nokogiri::HTML OpenURI.open_uri(::Addressable::URI.parse(url).normalize)
|
47
|
-
|
94
|
+
doc.css("table.publications-table > tbody > tr").map do |h|
|
48
95
|
link = h.at("td/div/strong/a")
|
49
96
|
serie = h.at("td[1]").text.strip
|
50
97
|
code = h.at("td[2]").text.strip
|
@@ -59,39 +106,59 @@ module RelatonNist
|
|
59
106
|
}, self
|
60
107
|
)
|
61
108
|
end
|
62
|
-
|
63
|
-
|
64
|
-
|
109
|
+
end
|
110
|
+
|
111
|
+
# Fetches data from json
|
112
|
+
# @param docid [String]
|
113
|
+
def from_json(docid, **opts)
|
114
|
+
data.select do |doc|
|
115
|
+
if year
|
116
|
+
d = Date.strptime year, "%Y"
|
117
|
+
idate = RelatonNist.parse_date doc["issued-date"]
|
118
|
+
next unless idate.between? d, d.next_year.prev_day
|
119
|
+
end
|
120
|
+
if /PD/ =~ opts[:stage]
|
121
|
+
next unless %w[draft-public draft-prelim].include? doc["status"]
|
65
122
|
else
|
66
|
-
|
123
|
+
next unless doc["status"] == "final"
|
67
124
|
end
|
125
|
+
doc["docidentifier"] =~ Regexp.new(docid)
|
126
|
+
end.map do |h|
|
127
|
+
/(?<serie>(?<=-)\w+$)/ =~ h["series"]
|
128
|
+
title = [h["title-main"], h["title-sub"]].compact.join " - "
|
129
|
+
release_date = RelatonNist.parse_date h["published-date"]
|
130
|
+
Hit.new(
|
131
|
+
{
|
132
|
+
code: h["docidentifier"], serie: serie.upcase, title: title,
|
133
|
+
url: h["uri"], status: h["status"], release_date: release_date,
|
134
|
+
json: h
|
135
|
+
}, self
|
136
|
+
)
|
68
137
|
end
|
69
|
-
concat hits
|
70
|
-
# concat(hits.map { |h| Hit.new(h, self) })
|
71
|
-
@fetched = false
|
72
|
-
# @hit_pages = hit_pages
|
73
138
|
end
|
74
|
-
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
75
139
|
|
76
|
-
#
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
140
|
+
# Fetches json data
|
141
|
+
# @return [Hash]
|
142
|
+
def data
|
143
|
+
ctime = File.ctime DATAFILE if File.exist? DATAFILE
|
144
|
+
if !ctime || ctime.to_date < Date.today
|
145
|
+
resp = OpenURI.open_uri("https://csrc.nist.gov/CSRC/media/feeds/metanorma/pubs-export.meta")
|
146
|
+
if !ctime || ctime < resp.last_modified
|
147
|
+
@data = nil
|
148
|
+
zip = OpenURI.open_uri "https://csrc.nist.gov/CSRC/media/feeds/metanorma/pubs-export.zip"
|
149
|
+
FileUtils.mv zip.path, DATAFILE
|
150
|
+
end
|
82
151
|
end
|
83
|
-
|
84
|
-
workers.result
|
85
|
-
@fetched = true
|
86
|
-
self
|
87
|
-
end
|
152
|
+
return if @data
|
88
153
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
154
|
+
Zip::File.open(DATAFILE) do |zf|
|
155
|
+
zf.each do |f|
|
156
|
+
@data = JSON.parse f.get_input_stream.read
|
157
|
+
break
|
158
|
+
end
|
159
|
+
end
|
160
|
+
@data
|
95
161
|
end
|
162
|
+
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
96
163
|
end
|
97
164
|
end
|
@@ -15,8 +15,7 @@ module RelatonNist
|
|
15
15
|
def search(text, year = nil, opts = {})
|
16
16
|
HitCollection.new text, year, opts
|
17
17
|
rescue OpenURI::HTTPError, SocketError
|
18
|
-
|
19
|
-
[]
|
18
|
+
raise RelatonBib::RequestError, "Could not access https://www.nist.gov"
|
20
19
|
end
|
21
20
|
|
22
21
|
# @param code [String] the NIST standard Code to look up (e.g. "8200")
|
@@ -83,26 +82,22 @@ module RelatonNist
|
|
83
82
|
# @return [Hash]
|
84
83
|
def nistbib_results_filter(result, year, opts)
|
85
84
|
missed_years = []
|
85
|
+
iter = opts[:stage]&.slice(-3, 1)
|
86
|
+
iteration = case iter
|
87
|
+
when "I" then "1"
|
88
|
+
when "F" then "final"
|
89
|
+
else iter
|
90
|
+
end
|
86
91
|
result.each_slice(3) do |s| # ISO website only allows 3 connections
|
87
92
|
fetch_pages(s, 3).each_with_index do |r, _i|
|
88
93
|
if opts[:issued_date]
|
89
|
-
r.dates.select { |d| d.type == "issued"
|
90
|
-
|
91
|
-
end
|
94
|
+
ids = r.dates.select { |d| d.type == "issued" && d.on == opts[:issued_date] }
|
95
|
+
next if ids.empty?
|
92
96
|
elsif opts[:updated_date]
|
93
|
-
r.dates.select { |d| d.type == "published"
|
94
|
-
|
95
|
-
end
|
96
|
-
end
|
97
|
-
if opts[:stage]
|
98
|
-
iter = opts[:stage][-3]
|
99
|
-
iteration = case iter
|
100
|
-
when "I" then 1
|
101
|
-
when "F" then "final"
|
102
|
-
else iter.to_i
|
103
|
-
end
|
104
|
-
next if iter && r.status.iteration != iteration
|
97
|
+
pds = r.dates.select { |d| d.type == "published" && d.on == opts[:updated_date] }
|
98
|
+
next if pds.empty?
|
105
99
|
end
|
100
|
+
next if iter && r.status.iteration != iteration
|
106
101
|
return { ret: r } if !year
|
107
102
|
|
108
103
|
r.dates.select { |d| d.type == "published" }.each do |d|
|
@@ -11,23 +11,55 @@ module RelatonNist
|
|
11
11
|
# @param hit_data [Hash]
|
12
12
|
# @return [Hash]
|
13
13
|
def parse_page(hit_data)
|
14
|
-
|
15
|
-
|
16
|
-
|
14
|
+
item_data = if hit_data[:json]
|
15
|
+
from_json hit_data
|
16
|
+
else
|
17
|
+
from_csrs hit_data
|
18
|
+
end
|
17
19
|
doctype = "standard"
|
18
20
|
titles = fetch_titles(hit_data)
|
19
|
-
unless /^(SP|NISTIR|FIPS)
|
20
|
-
doctype = id_cleanup(docid[0].id)
|
21
|
-
docid[0] = RelatonBib::DocumentIdentifier.new(
|
21
|
+
unless /^(SP|NISTIR|FIPS) / =~ item_data[:docid][0].id
|
22
|
+
doctype = id_cleanup(item_data[:docid][0].id)
|
23
|
+
item_data[:docid][0] = RelatonBib::DocumentIdentifier.new(
|
24
|
+
id: titles[0][:content], type: "NIST",
|
25
|
+
)
|
22
26
|
end
|
27
|
+
item_data[:fetched] = Date.today.to_s
|
28
|
+
item_data[:type] = "standard"
|
29
|
+
item_data[:titles] = titles
|
30
|
+
item_data[:doctype] = doctype
|
31
|
+
|
32
|
+
NistBibliographicItem.new(**item_data)
|
33
|
+
end
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
def from_json(hit_data)
|
38
|
+
json = hit_data[:json]
|
39
|
+
{
|
40
|
+
link: fetch_link(json),
|
41
|
+
docid: fetch_docid(json["docidentifier"]),
|
42
|
+
dates: fetch_dates(json, hit_data[:release_date]),
|
43
|
+
contributors: fetch_contributors(json),
|
44
|
+
edition: fetch_edition(json),
|
45
|
+
language: [json["language"]],
|
46
|
+
script: [json["script"]],
|
47
|
+
# abstract: fetch_abstract(doc),
|
48
|
+
docstatus: fetch_status(json, hit_data[:status]),
|
49
|
+
copyright: fetch_copyright(json["published-date"]),
|
50
|
+
relations: fetch_relations_json(json),
|
51
|
+
# series: fetch_series(json),
|
52
|
+
keyword: fetch_keywords(json),
|
53
|
+
commentperiod: fetch_commentperiod_json(json),
|
54
|
+
}
|
55
|
+
end
|
23
56
|
|
24
|
-
|
25
|
-
|
26
|
-
|
57
|
+
def from_csrs(hit_data)
|
58
|
+
doc = get_page hit_data[:url]
|
59
|
+
{
|
27
60
|
# id: fetch_id(doc),
|
28
|
-
titles: titles,
|
29
61
|
link: fetch_link(doc),
|
30
|
-
docid:
|
62
|
+
docid: fetch_docid(doc),
|
31
63
|
dates: fetch_dates(doc, hit_data[:release_date]),
|
32
64
|
contributors: fetch_contributors(doc),
|
33
65
|
edition: fetch_edition(hit_data[:code]),
|
@@ -40,8 +72,7 @@ module RelatonNist
|
|
40
72
|
series: fetch_series(doc),
|
41
73
|
keyword: fetch_keywords(doc),
|
42
74
|
commentperiod: fetch_commentperiod(doc),
|
43
|
-
|
44
|
-
)
|
75
|
+
}
|
45
76
|
end
|
46
77
|
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
47
78
|
|
@@ -52,8 +83,6 @@ module RelatonNist
|
|
52
83
|
id.sub(/ \(WITHDRAWN\)/, "").sub(/ \(([^) ]+ )?DRAFT\)/i, "")
|
53
84
|
end
|
54
85
|
|
55
|
-
private
|
56
|
-
|
57
86
|
# Get page.
|
58
87
|
# @param path [String] page's path
|
59
88
|
# @return [Array<Nokogiri::HTML::Document, String>]
|
@@ -61,16 +90,23 @@ module RelatonNist
|
|
61
90
|
uri = URI url
|
62
91
|
resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
|
63
92
|
Nokogiri::HTML(resp.body)
|
93
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
|
94
|
+
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
|
95
|
+
raise RelatonBib::RequestError, "Could not access #{url}"
|
64
96
|
end
|
65
97
|
|
66
98
|
# Fetch docid.
|
67
|
-
# @param doc [Nokogiri::HTML::Document]
|
99
|
+
# @param doc [Nokogiri::HTML::Document, String]
|
68
100
|
# @return [Array<RelatonBib::DocumentIdentifier>]
|
69
101
|
def fetch_docid(doc)
|
70
|
-
item_ref = doc.
|
71
|
-
|
72
|
-
|
73
|
-
|
102
|
+
item_ref = if doc.is_a? String
|
103
|
+
doc
|
104
|
+
else
|
105
|
+
doc.at(
|
106
|
+
"//div[contains(@class, 'publications-detail')]/h3",
|
107
|
+
)&.text&.strip
|
108
|
+
end
|
109
|
+
item_ref ||= "?"
|
74
110
|
[RelatonBib::DocumentIdentifier.new(id: item_ref, type: "NIST")]
|
75
111
|
end
|
76
112
|
|
@@ -83,56 +119,48 @@ module RelatonNist
|
|
83
119
|
# end
|
84
120
|
|
85
121
|
# Fetch status.
|
86
|
-
# @param doc [Nokogiri::HTML::Document]
|
122
|
+
# @param doc [Nokogiri::HTML::Document, Hash]
|
87
123
|
# @param status [String]
|
88
|
-
# @return [
|
124
|
+
# @return [RelatonNist::DocumentStatus]
|
89
125
|
def fetch_status(doc, status)
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
when "retired draft"
|
95
|
-
stage = "draft-public"
|
96
|
-
subst = "retired"
|
97
|
-
when "withdrawn"
|
98
|
-
stage = "final"
|
99
|
-
subst = "withdrawn"
|
100
|
-
when "draft"
|
101
|
-
stage = "draft-public"
|
102
|
-
subst = "active"
|
126
|
+
if doc.is_a? Hash
|
127
|
+
stage = doc["status"]
|
128
|
+
subst = doc["substage"]
|
129
|
+
iter = doc["iteration"] == "initial" ? 1 : doc["iteration"]
|
103
130
|
else
|
104
|
-
|
105
|
-
|
106
|
-
|
131
|
+
case status
|
132
|
+
when "draft (withdrawn)"
|
133
|
+
stage = "draft-public"
|
134
|
+
subst = "withdrawn"
|
135
|
+
when "retired draft"
|
136
|
+
stage = "draft-public"
|
137
|
+
subst = "retired"
|
138
|
+
when "withdrawn"
|
139
|
+
stage = "final"
|
140
|
+
subst = "withdrawn"
|
141
|
+
when "draft"
|
142
|
+
stage = "draft-public"
|
143
|
+
subst = "active"
|
144
|
+
else
|
145
|
+
stage = status
|
146
|
+
subst = "active"
|
147
|
+
end
|
148
|
+
|
149
|
+
iter = nil
|
150
|
+
if stage.include? "draft"
|
151
|
+
iter = 1
|
152
|
+
history = doc.xpath("//span[@id='pub-history-container']/a"\
|
153
|
+
"|//span[@id='pub-history-container']/span")
|
154
|
+
history.each_with_index do |h, idx|
|
155
|
+
next if h.name == "a"
|
107
156
|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
history = doc.xpath("//span[@id='pub-history-container']/a"\
|
112
|
-
"|//span[@id='pub-history-container']/span")
|
113
|
-
history.each_with_index do |h, idx|
|
114
|
-
next if h.name == "a"
|
115
|
-
|
116
|
-
iter = idx + 1 if idx.positive?
|
117
|
-
# iter = if lsif idx < (history.size - 1) && !history.last.text.include?("Draft")
|
118
|
-
# "final"
|
119
|
-
# elsif idx.positive? then idx + 1
|
120
|
-
# end
|
121
|
-
break
|
157
|
+
iter = idx + 1 if idx.positive?
|
158
|
+
break
|
159
|
+
end
|
122
160
|
end
|
123
161
|
end
|
124
162
|
|
125
|
-
|
126
|
-
# substage = "withdrawn"
|
127
|
-
# else
|
128
|
-
# substage = "active"
|
129
|
-
# item_ref = doc.at(
|
130
|
-
# "//div[contains(@class, 'publications-detail')]/h3",
|
131
|
-
# ).text.strip
|
132
|
-
# wip = item_ref.match(/(?<=\()\w+/).to_s
|
133
|
-
# stage = "draft-public" if wip == "DRAFT"
|
134
|
-
# end
|
135
|
-
RelatonNist::DocumentStatus.new stage: stage, substage: subst, iteration: iter
|
163
|
+
RelatonNist::DocumentStatus.new stage: stage, substage: subst, iteration: iter.to_s
|
136
164
|
end
|
137
165
|
|
138
166
|
# Fetch titles.
|
@@ -144,46 +172,87 @@ module RelatonNist
|
|
144
172
|
|
145
173
|
# Fetch dates
|
146
174
|
# @param doc [Nokogiri::HTML::Document]
|
175
|
+
# @param release_date [Date]
|
147
176
|
# @return [Array<Hash>]
|
148
177
|
def fetch_dates(doc, release_date)
|
149
178
|
dates = [{ type: "published", on: release_date.to_s }]
|
150
179
|
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
180
|
+
if doc.is_a? Hash
|
181
|
+
issued = RelatonNist.parse_date doc["issued-date"]
|
182
|
+
updated = RelatonNist.parse_date doc["updated-date"]
|
183
|
+
dates << { type: "updated", on: updated.to_s } if updated
|
184
|
+
obsoleted = RelatonNist.parse_date doc["obsoleted-date"]
|
185
|
+
dates << { type: "obsoleted", on: obsoleted.to_s } if obsoleted
|
186
|
+
else
|
187
|
+
d = doc.at("//span[@id='pub-release-date']").text.strip
|
188
|
+
issued = RelatonNist.parse_date d
|
189
|
+
end
|
190
|
+
dates << { type: "issued", on: issued.to_s }
|
159
191
|
dates
|
160
192
|
end
|
161
193
|
|
194
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
195
|
+
# @param doc [Nokogiri::HTML::Document, Hash]
|
196
|
+
# @return [Array<RelatonBib::ContributionInfo>]
|
162
197
|
def fetch_contributors(doc)
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
198
|
+
contribs = []
|
199
|
+
if doc.is_a? Hash
|
200
|
+
contribs += contributors_json(
|
201
|
+
doc["authors"], "author", doc["language"], doc["script"]
|
202
|
+
)
|
203
|
+
contribs + contributors_json(
|
204
|
+
doc["editors"], "editor", doc["language"], doc["script"]
|
205
|
+
)
|
206
|
+
else
|
207
|
+
name = "National Institute of Standards and Technology"
|
208
|
+
org = RelatonBib::Organization.new(
|
209
|
+
name: name, url: "www.nist.gov", abbreviation: "NIST",
|
210
|
+
)
|
211
|
+
contribs << RelatonBib::ContributionInfo.new(entity: org, role: ["publisher"])
|
212
|
+
authors = doc.at('//h4[.="Author(s)"]/following-sibling::p')
|
213
|
+
contribs += contributors(authors, "author")
|
214
|
+
editors = doc.at('//h4[.="Editor(s)"]/following-sibling::p')
|
215
|
+
contribs + contributors(editors, "editor")
|
216
|
+
end
|
217
|
+
end
|
173
218
|
|
174
|
-
|
175
|
-
|
219
|
+
# @param doc [Array<Hash>]
|
220
|
+
# @param role [String]
|
221
|
+
# @return [Array<RelatonBib::ContributionInfo>]
|
222
|
+
def contributors_json(doc, role, lang = "en", script = "Latn")
|
223
|
+
doc.map do |contr|
|
224
|
+
if contr["affiliation"]
|
225
|
+
if contr["affiliation"]["acronym"]
|
226
|
+
abbrev = RelatonBib::LocalizedString.new(contr["affiliation"]["acronym"])
|
227
|
+
end
|
228
|
+
org = RelatonBib::Organization.new(
|
229
|
+
name: contr["affiliation"]["name"], abbreviation: abbrev,
|
230
|
+
)
|
231
|
+
end
|
232
|
+
if contr["surname"]
|
233
|
+
affiliation = RelatonBib::Affilation.new org
|
234
|
+
entity = RelatonBib::Person.new(
|
235
|
+
name: full_name(contr, lang, script), affiliation: [affiliation],
|
236
|
+
)
|
237
|
+
else
|
238
|
+
entity = org
|
239
|
+
end
|
240
|
+
RelatonBib::ContributionInfo.new entity: entity, role: [role]
|
241
|
+
end
|
176
242
|
end
|
177
243
|
|
178
244
|
# rubocop:disable Metrics/CyclomaticComplexity
|
179
|
-
|
245
|
+
# @param doc [Nokogiri::HTML::Element, Array<Hash>]
|
246
|
+
# @param role [String]
|
247
|
+
# @return [Array<RelatonBib::ContributionInfo>]
|
248
|
+
def contributors(doc, role, lang = "en", script = "Latn")
|
180
249
|
return [] if doc.nil?
|
181
250
|
|
182
251
|
doc.text.split(", ").map do |contr|
|
183
252
|
/(?<an>.+?)(\s+\((?<abbrev>.+?)\))?$/ =~ contr
|
184
253
|
if abbrev && an.downcase !~ /(task|force|group)/ && an.split.size.between?(2, 3)
|
185
254
|
fullname = RelatonBib::FullName.new(
|
186
|
-
completename: RelatonBib::LocalizedString.new(an,
|
255
|
+
completename: RelatonBib::LocalizedString.new(an, lang, script),
|
187
256
|
)
|
188
257
|
case abbrev
|
189
258
|
when "NIST"
|
@@ -199,7 +268,7 @@ module RelatonNist
|
|
199
268
|
org = RelatonBib::Organization.new name: org_name, url: url, abbreviation: abbrev
|
200
269
|
affiliation = RelatonBib::Affilation.new org
|
201
270
|
entity = RelatonBib::Person.new(
|
202
|
-
name: fullname, affiliation: [affiliation],
|
271
|
+
name: fullname, affiliation: [affiliation],
|
203
272
|
)
|
204
273
|
else
|
205
274
|
entity = RelatonBib::Organization.new name: an, abbreviation: abbrev
|
@@ -207,17 +276,49 @@ module RelatonNist
|
|
207
276
|
RelatonBib::ContributionInfo.new entity: entity, role: [role]
|
208
277
|
end
|
209
278
|
end
|
210
|
-
# rubocop:enable Metrics/CyclomaticComplexity
|
279
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/AbcSize, Metrics/MethodLength
|
280
|
+
|
281
|
+
# @param name [Hash]
|
282
|
+
# @param lang [String]
|
283
|
+
# @param script [String]
|
284
|
+
# @return [RelatonBib::FullName]
|
285
|
+
def full_name(name, lang, script)
|
286
|
+
RelatonBib::FullName.new(
|
287
|
+
surname: RelatonBib::LocalizedString.new(name["surname"], lang, script),
|
288
|
+
forenames: name_parts(name["givenName"], lang, script),
|
289
|
+
additions: name_parts(name["suffix"], lang, script),
|
290
|
+
prefix: name_parts(name["title"], lang, script),
|
291
|
+
completename: RelatonBib::LocalizedString.new(name["fullName"], lang, script),
|
292
|
+
)
|
293
|
+
end
|
211
294
|
|
212
|
-
|
213
|
-
|
295
|
+
# @param part [String, NilClass]
|
296
|
+
# @param lang [String]
|
297
|
+
# @param script [String]
|
298
|
+
# @return [Array<RelatonBib::LocalizedString>]
|
299
|
+
def name_parts(part, lang, script)
|
300
|
+
return [] unless part
|
301
|
+
|
302
|
+
[RelatonBib::LocalizedString.new(name[part], lang, script)]
|
303
|
+
end
|
304
|
+
|
305
|
+
# @param doc [String, Hash]
|
306
|
+
# @return [String, NilClass]
|
307
|
+
def fetch_edition(doc)
|
308
|
+
if doc.is_a? Hash
|
309
|
+
return unless doc["edition"]
|
310
|
+
|
311
|
+
rev = doc["edition"]
|
312
|
+
else
|
313
|
+
return unless /(?<=Rev\.\s)(?<rev>\d+)/ =~ doc
|
314
|
+
end
|
214
315
|
|
215
316
|
"Revision #{rev}"
|
216
317
|
end
|
217
318
|
|
218
319
|
# Fetch abstracts.
|
219
320
|
# @param doc [Nokogiri::HTML::Document]
|
220
|
-
# @return [Array<
|
321
|
+
# @return [Array<Hash>]
|
221
322
|
def fetch_abstract(doc)
|
222
323
|
abstract_content = doc.xpath('//div[contains(@class, "pub-abstract-callout")]/div[1]/p').text
|
223
324
|
[{
|
@@ -229,58 +330,82 @@ module RelatonNist
|
|
229
330
|
end
|
230
331
|
|
231
332
|
# Fetch copyright.
|
232
|
-
# @param
|
333
|
+
# @param doc [Nokogiri::HTML::Document, String]
|
233
334
|
# @return [Hash]
|
234
335
|
def fetch_copyright(doc)
|
235
336
|
name = "National Institute of Standards and Technology"
|
236
337
|
url = "www.nist.gov"
|
237
|
-
d = doc.
|
338
|
+
d = if doc.is_a? String then doc
|
339
|
+
else
|
340
|
+
doc.at("//span[@id='pub-release-date']").text.strip
|
341
|
+
end
|
238
342
|
from = d.match(/\d{4}/).to_s
|
239
343
|
{ owner: { name: name, abbreviation: "NIST", url: url }, from: from }
|
240
344
|
end
|
241
345
|
|
242
346
|
# Fetch links.
|
243
|
-
# @param doc [Nokogiri::HTML::Document]
|
347
|
+
# @param doc [Nokogiri::HTML::Document, Hash]
|
244
348
|
# @return [Array<Hash>]
|
245
349
|
def fetch_link(doc)
|
246
|
-
pub = doc.at "//p/strong[.='Publication:']"
|
247
350
|
links = []
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
351
|
+
if doc.is_a? Hash
|
352
|
+
links << { type: "uri", content: doc["uri"] } if doc["uri"]
|
353
|
+
doi = "https://doi.org/" + doc["doi"] if doc["doi"]
|
354
|
+
else
|
355
|
+
pub = doc.at "//p/strong[.='Publication:']"
|
356
|
+
pdf = pub.at "./following-sibling::a[.=' Local Download']"
|
357
|
+
doi = pub.at("./following-sibling::a[contains(.,'(DOI)')]")&.attr :href
|
358
|
+
links << { type: "pdf", content: pdf[:href] } if pdf
|
359
|
+
end
|
360
|
+
links << { type: "doi", content: doi } if doi
|
252
361
|
links
|
253
362
|
end
|
254
363
|
|
255
364
|
# Fetch relations.
|
256
365
|
# @param doc [Nokogiri::HTML::Document]
|
257
|
-
# @return [Array<
|
366
|
+
# @return [Array<RelatonBib::DocumentRelation>]
|
258
367
|
def fetch_relations(doc)
|
259
368
|
relations = doc.xpath('//span[@id="pub-supersedes-container"]/a').map do |r|
|
260
|
-
doc_relation "supersedes", r
|
369
|
+
doc_relation "supersedes", r.text, DOMAIN + r[:href]
|
261
370
|
end
|
262
371
|
|
263
372
|
relations += doc.xpath('//span[@id="pub-part-container"]/a').map do |r|
|
264
|
-
doc_relation "partOf", r
|
373
|
+
doc_relation "partOf", r.text, DOMAIN + r[:href]
|
265
374
|
end
|
266
375
|
|
267
376
|
relations + doc.xpath('//span[@id="pub-related-container"]/a').map do |r|
|
268
|
-
doc_relation "updates", r
|
377
|
+
doc_relation "updates", r.text, DOMAIN + r[:href]
|
269
378
|
end
|
270
379
|
end
|
271
380
|
|
272
|
-
def
|
381
|
+
def fetch_relations_json(doc)
|
382
|
+
relations = doc["supersedes"].map do |r|
|
383
|
+
doc_relation "supersedes", r["docidentifier"], r["uri"]
|
384
|
+
end
|
385
|
+
|
386
|
+
relations + doc["superseded-by"].map do |r|
|
387
|
+
doc_relation "updates", r["docidentifier"], r["uri"]
|
388
|
+
end
|
389
|
+
end
|
390
|
+
|
391
|
+
# @param type [String]
|
392
|
+
# @param ref [String]
|
393
|
+
# @param uri [String]
|
394
|
+
# @return [RelatonBib::DocumentRelation]
|
395
|
+
def doc_relation(type, ref, uri, lang = "en", script = "Latn")
|
273
396
|
RelatonBib::DocumentRelation.new(
|
274
397
|
type: type,
|
275
398
|
bibitem: RelatonBib::BibliographicItem.new(
|
276
399
|
formattedref: RelatonBib::FormattedRef.new(
|
277
|
-
content: ref
|
400
|
+
content: ref, language: lang, script: script, format: "text/plain",
|
278
401
|
),
|
279
|
-
link: [RelatonBib::TypedUri.new(type: "src", content:
|
402
|
+
link: [RelatonBib::TypedUri.new(type: "src", content: uri)],
|
280
403
|
),
|
281
404
|
)
|
282
405
|
end
|
283
406
|
|
407
|
+
# @param doc [Nokogiri::HTML::Document]
|
408
|
+
# @return [Array<RelatonBib::Series>]
|
284
409
|
def fetch_series(doc)
|
285
410
|
series = doc.xpath "//span[@id='pub-history-container']/a"\
|
286
411
|
"|//span[@id='pub-history-container']/span"
|
@@ -305,11 +430,19 @@ module RelatonNist
|
|
305
430
|
end.select { |s| s }
|
306
431
|
end
|
307
432
|
|
433
|
+
# @param doc [Nokogiri::HTML::Document, Hash]
|
434
|
+
# @return [Array<RelatonNist::Keyword>]
|
308
435
|
def fetch_keywords(doc)
|
309
|
-
kws = doc.
|
310
|
-
|
436
|
+
kws = if doc.is_a? Hash
|
437
|
+
doc["keywords"]
|
438
|
+
else
|
439
|
+
doc.xpath "//span[@id='pub-keywords-container']/span"
|
440
|
+
end
|
441
|
+
kws.map { |kw| Keyword.new kw.is_a?(String) ? kw : kw.text }
|
311
442
|
end
|
312
443
|
|
444
|
+
# @param doc [Nokogiri::HTML::Document]
|
445
|
+
# @return [RelatonNist::CommentPeriod, NilClass]
|
313
446
|
def fetch_commentperiod(doc)
|
314
447
|
cp = doc.at "//span[@id='pub-comments-due']"
|
315
448
|
return unless cp
|
@@ -324,6 +457,12 @@ module RelatonNist
|
|
324
457
|
extended = ext.empty? ? nil : Date.strptime(ext, "%B %d, %Y")
|
325
458
|
CommentPeriod.new from, to, extended
|
326
459
|
end
|
460
|
+
|
461
|
+
# @param json [Hash]
|
462
|
+
# @return [RelatonNist::CommentPeriod, NilClass]
|
463
|
+
def fetch_commentperiod_json(json)
|
464
|
+
CommentPeriod.new json["comment-from"], json["comment-to"] if json["comment-from"]
|
465
|
+
end
|
327
466
|
end
|
328
467
|
end
|
329
468
|
end
|
data/lib/relaton_nist/version.rb
CHANGED
data/relaton_nist.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-nist
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-06-
|
11
|
+
date: 2019-06-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -178,6 +178,20 @@ dependencies:
|
|
178
178
|
- - "~>"
|
179
179
|
- !ruby/object:Gem::Version
|
180
180
|
version: 0.2.0
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: rubyzip
|
183
|
+
requirement: !ruby/object:Gem::Requirement
|
184
|
+
requirements:
|
185
|
+
- - ">="
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
version: '0'
|
188
|
+
type: :runtime
|
189
|
+
prerelease: false
|
190
|
+
version_requirements: !ruby/object:Gem::Requirement
|
191
|
+
requirements:
|
192
|
+
- - ">="
|
193
|
+
- !ruby/object:Gem::Version
|
194
|
+
version: '0'
|
181
195
|
description: 'RelatonNist: retrive NIST standards.'
|
182
196
|
email:
|
183
197
|
- open.source@ribose.com
|
@@ -200,6 +214,7 @@ files:
|
|
200
214
|
- lib/relaton/processor.rb
|
201
215
|
- lib/relaton_nist.rb
|
202
216
|
- lib/relaton_nist/comment_period.rb
|
217
|
+
- lib/relaton_nist/data/pubs-export.zip
|
203
218
|
- lib/relaton_nist/document_status.rb
|
204
219
|
- lib/relaton_nist/hit.rb
|
205
220
|
- lib/relaton_nist/hit_collection.rb
|
@@ -230,7 +245,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
230
245
|
version: '0'
|
231
246
|
requirements: []
|
232
247
|
rubyforge_project:
|
233
|
-
rubygems_version: 2.
|
248
|
+
rubygems_version: 2.6.12
|
234
249
|
signing_key:
|
235
250
|
specification_version: 4
|
236
251
|
summary: 'RelatonNist: retrive NIST standards.'
|