relaton-nist 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,150 @@
1
+ require "relaton_bib"
2
+ require "relaton_nist/nist_bibliographic_item"
3
+ require "relaton_nist/scrapper"
4
+ require "relaton_nist/hit_collection"
5
+ require "relaton_nist/xml_parser"
6
+ require "relaton_nist/keyword"
7
+ require "relaton_nist/comment_period"
8
+ require "relaton_nist/document_status"
9
+
10
+ module RelatonNist
11
+ class NistBibliography
12
+ class << self
# Search for NIST documents matching a reference.
#
# @param text [String] reference text handed to HitCollection
# @param year [String, nil] optional year handed to HitCollection
# @param opts [Hash] options handed to HitCollection
# @return [RelatonNist::HitCollection, Array] the hit collection, or an
#   empty Array when an HTTP or socket error was raised
def search(text, year = nil, opts = {})
  HitCollection.new(text, year, opts)
rescue OpenURI::HTTPError, SocketError
  # Network failures are reported on stderr and treated as "no hits".
  warn("Could not access https://www.nist.gov")
  []
end
21
+
# Look up a single NIST document by its reference.
#
# Qualifiers embedded in +code+ are parsed out before searching:
# * "(Month YYYY)" is stored as opts[:issued_date];
# * "(Month DD, YYYY)" is stored as opts[:updated_date];
# * any other parenthesised text, e.g. "(2PD)", is stored as opts[:stage];
# * a ":YYYY" suffix supplies the year when +year+ is not given.
#
# @param code [String] the NIST standard Code to look up (e.g. "8200")
# @param year [String, nil] the year the standard was published (optional)
# @param opts [Hash] options; the caller's Hash is not modified
# @option opts [TrueClass, FalseClass] :all_parts when truthy, "-1" is
#   appended to the code before searching
# @option opts [TrueClass, FalseClass] :bibdata not read by this method
#
# @return [Object, nil] whatever nistbib_get1 returns for the parsed
#   reference; nil when no match was found
def get(code, year = nil, opts = {})
  # Work on a copy so the parsed filters never leak into the caller's Hash.
  opts = opts.dup

  /^(?<code2>[^\(]+)(\((?<date2>\w+\s(\d{2},\s)?\d{4})\))?\s?\(?((?<=\()(?<stage>[^\)]+))?/ =~ code
  if code2
    code = code2.strip
    if date2
      if /\w+\s\d{4}/ =~ date2
        opts[:issued_date] = Time.strptime date2, "%B %Y"
      elsif /\w+\s\d{2},\s\d{4}/ =~ date2
        opts[:updated_date] = Time.strptime date2, "%B %d, %Y"
      end
    end
    opts[:stage] = stage if stage
  end

  if year.nil?
    # "CODE:YEAR" form: split the year off the code.
    /^(?<code1>[^:]+):(?<year1>[^:]+)$/ =~ code
    unless code1.nil?
      code = code1
      year = year1
    end
  end

  code += "-1" if opts[:all_parts]
  nistbib_get1(code, year, opts)
end
60
+
61
+ private
62
+
# Run the search, filter the hits and report a miss on stderr.
#
# @param code [String]
# @param year [String, nil]
# @param opts [Hash]
# @return [Object, nil] the :ret entry produced by nistbib_results_filter,
#   otherwise the result of fetch_ref_err
def nistbib_get1(code, year, opts)
  hits = nistbib_search_filter(code, year, opts)
  return nil unless hits

  outcome = nistbib_results_filter(hits, year, opts)
  outcome[:ret] || fetch_ref_err(code, year, outcome[:years])
end
70
+
# Walk through the search results, fetching them three at a time, and
# return the first fetched item that passes every requested filter:
# issued/updated date, draft stage iteration and publication year.
# If nothing matches, the publication years that caused a year mismatch
# are returned for error reporting.
#
# @param result [Enumerable] hits to fetch (each must work with fetch_pages)
# @param year [String, nil] required publication year, if any
# @param opts [Hash] options
# @option opts [Time] :issued_date required "issued" date
# @option opts [Time] :updated_date required "published" date; only
#   checked when :issued_date is absent
# @option opts [String] :stage stage label such as "2PD"; its third
#   character from the end selects the iteration ("I" => 1, "F" => "final")
#
# @return [Hash] { ret: item } on success, { years: [Integer, ...] } otherwise
def nistbib_results_filter(result, year, opts)
  missed_years = []
  result.each_slice(3) do |slice|
    fetch_pages(slice, 3).each do |item|
      # The date checks must skip the whole item, so `next` has to be issued
      # from this block and not from a nested iteration over the dates.
      # NOTE(review): assumes the dates' `on` values compare equal to the
      # Time objects stored in opts - confirm against RelatonBib.
      if opts[:issued_date]
        next unless item.dates.any? { |d| d.type == "issued" && d.on == opts[:issued_date] }
      elsif opts[:updated_date]
        next unless item.dates.any? { |d| d.type == "published" && d.on == opts[:updated_date] }
      end

      if opts[:stage]
        iter = opts[:stage][-3]
        iteration = case iter
                    when "I" then 1
                    when "F" then "final"
                    else iter.to_i
                    end
        next if iter && item.status.iteration != iteration
      end

      return { ret: item } unless year

      item.dates.select { |d| d.type == "published" }.each do |d|
        return { ret: item } if year.to_i == d.on.year

        missed_years << d.on.year
      end
    end
  end
  { years: missed_years }
end
117
+
# Fetch every hit in +s+ through a RelatonBib::WorkersPool and return the
# fetched results in the same order as the hits were given.
#
# @param s [Array] hits responding to #fetch
# @param n [Integer] pool size passed to RelatonBib::WorkersPool.new
# @return [Array] fetched results, ordered like +s+
def fetch_pages(s, n)
  pool = RelatonBib::WorkersPool.new n
  pool.worker { |job| { i: job[:i], hit: job[:hit].fetch } }
  s.each_with_index { |hit, position| pool << { i: position, hit: hit } }
  pool.end
  # Results are re-ordered by the position recorded with each job.
  pool.result.sort_by { |entry| entry[:i] }.map { |entry| entry[:hit] }
end
125
+
# Search for +code+ and keep only the hits whose code contains the numeric
# part of the reference and, when the reference names a series (FIPS, SP
# or NISTIR followed by whitespace), whose series matches it.
#
# @param code [String] reference, e.g. "FIPS 140-2" or "SP 800-53"
# @param year [String, nil] passed through to search
# @param opts [Hash] passed through to search
# @return [Array] the matching hits
def nistbib_search_filter(code, year, opts)
  # First run of three or more digits/hyphens; "" when there is none, in
  # which case every hit that has a code passes the code check.
  docid = code.match(%r{[0-9-]{3,}}).to_s
  serie = code[%r{(FIPS|SP|NISTIR)(?=\s)}]
  warn "fetching #{code}..."
  search(code, year, opts).select do |i|
    i.hit[:code]&.include?(docid) && (!serie || i.hit[:serie] == serie)
  end
end
135
+
# Report on stderr that no document matched the reference.
#
# @param code [String] the reference that was searched for
# @param year [String, nil] the requested year, if any
# @param missed_years [Array] years of results rejected for the wrong year
# @return [nil]
def fetch_ref_err(code, year, missed_years)
  id = code
  id = "#{code}:#{year}" if year
  warn "WARNING: no match found online for #{id}. "\
    "The code must be exactly like it is on the standards website."
  unless missed_years.empty?
    warn "(There was no match for #{year}, though there were matches "\
      "found for #{missed_years.join(', ')}.)"
  end
  # A digit-hyphen-digit sequence suggests the reference names a part.
  if code =~ /\d-\d/
    warn "The provided document part may not exist, or the document "\
      "may no longer be published in parts."
  end
  nil
end
148
+ end
149
+ end
150
+ end
@@ -0,0 +1,329 @@
1
+ require "relaton_bib"
2
+
3
+ module RelatonNist
4
+ class Scrapper
5
+ class << self
6
+ DOMAIN = "https://csrc.nist.gov".freeze
7
+
8
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
9
+
# Download a publication page and build the bibliographic item for it.
#
# @param hit_data [Hash] search hit; :url, :title, :release_date, :code
#   and :status are read here or by the helpers called from here
# @return [RelatonNist::NistBibliographicItem]
def parse_page(hit_data)
  doc = get_page hit_data[:url]
  docid = fetch_docid(doc)
  titles = fetch_titles(hit_data)

  raw_id = docid[0].id
  if /^(SP|NISTIR|FIPS) /.match?(raw_id)
    doctype = "standard"
  else
    # Identifiers outside the SP/NISTIR/FIPS series: the cleaned-up
    # identifier becomes the doctype and the title becomes the identifier.
    doctype = id_cleanup(raw_id)
    docid[0] = RelatonBib::DocumentIdentifier.new(id: titles[0][:content], type: "NIST")
  end

  NistBibliographicItem.new(
    fetched: Date.today.to_s,
    type: "standard",
    # id: fetch_id(doc),
    titles: titles,
    link: fetch_link(doc),
    docid: docid,
    dates: fetch_dates(doc, hit_data[:release_date]),
    contributors: fetch_contributors(doc),
    edition: fetch_edition(hit_data[:code]),
    language: ["en"],
    script: ["Latn"],
    abstract: fetch_abstract(doc),
    docstatus: fetch_status(doc, hit_data[:status]),
    copyright: fetch_copyright(doc),
    relations: fetch_relations(doc),
    series: fetch_series(doc),
    keyword: fetch_keywords(doc),
    commentperiod: fetch_commentperiod(doc),
    doctype: doctype,
  )
end
46
+ # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
47
+
# Strip the status suffix from a document identifier.
#
# Removes the first " (WITHDRAWN)" (case-sensitive) and the first
# " (DRAFT)" / " (<word> DRAFT)" (case-insensitive) from the identifier.
#
# @param id [String]
# @return [String]
def id_cleanup(id)
  [/ \(WITHDRAWN\)/, / \(([^) ]+ )?DRAFT\)/i].reduce(id) do |cleaned, marker|
    cleaned.sub(marker, "")
  end
end
54
+
55
+ private
56
+
# Download a page and parse it as HTML.
#
# @param url [String] address of the page
# @return [Nokogiri::HTML::Document]
def get_page(url)
  response = Net::HTTP.get_response(URI(url)) # .encode("UTF-8")
  Nokogiri::HTML(response.body)
end
65
+
# Fetch docid.
#
# Reads the identifier from the publication heading. When the heading is
# missing from the page, a placeholder identifier "?" is returned instead.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<RelatonBib::DocumentIdentifier>]
def fetch_docid(doc)
  # Safe navigation keeps the fallback below reachable: without it a
  # missing heading raises before the nil check is ever evaluated.
  item_ref = doc.at("//div[contains(@class, 'publications-detail')]/h3")&.text&.strip
  return [RelatonBib::DocumentIdentifier.new(type: "NIST", id: "?")] unless item_ref

  [RelatonBib::DocumentIdentifier.new(id: item_ref, type: "NIST")]
end
76
+
77
+ # Fetch id.
78
+ # @param doc [Nokogiri::HTML::Document]
79
+ # @return [String]
80
+ # def fetch_id(doc)
81
+ # doc.at("//div[contains(@class, 'publications-detail')]/h3").text.
82
+ # strip.gsub(/\s/, "")
83
+ # end
84
+
# Fetch status.
#
# Maps the hit's status label to a stage/substage pair and, for draft
# stages, derives the iteration from the publication history list.
#
# @param doc [Nokogiri::HTML::Document]
# @param status [String] status label of the search hit
# @return [RelatonNist::DocumentStatus]
def fetch_status(doc, status)
  stage, subst =
    case status
    when "draft (withdrawn)" then ["draft-public", "withdrawn"]
    when "retired draft"     then ["draft-public", "retired"]
    when "withdrawn"         then ["final", "withdrawn"]
    when "draft"             then ["draft-public", "active"]
    else [status, "active"]
    end

  iter = nil
  if stage.include? "draft"
    history = doc.xpath("//span[@id='pub-history-container']/a"\
                        "|//span[@id='pub-history-container']/span")
    # The iteration is the 1-based position of the first history entry
    # that is not a link, or 1 when every entry is a link.
    # NOTE(review): presumably the non-link entry is the page being
    # parsed - confirm against the site markup.
    current = history.each_with_index.find { |node, _position| node.name != "a" }
    iter = current ? current.last + 1 : 1
  end

  RelatonNist::DocumentStatus.new stage: stage, substage: subst, iteration: iter
end
137
+
# Fetch titles.
#
# Wraps the hit's title in a single English, Latin-script, plain-text entry.
#
# @param hit_data [Hash] search hit; only :title is read
# @return [Array<Hash>]
def fetch_titles(hit_data)
  title = { content: hit_data[:title] }
  [title.merge(language: "en", script: "Latn", format: "text/plain")]
end
144
+
# Fetch dates.
#
# Always returns a "published" date taken from the hit's release date.
# An "issued" date is added when the page's release-date element holds a
# "Month YYYY" or "Month DD, YYYY" date; it is omitted when that element
# is missing or holds no such date.
#
# @param doc [Nokogiri::HTML::Document]
# @param release_date [#to_s] release date of the search hit
# @return [Array<Hash>]
def fetch_dates(doc, release_date)
  dates = [{ type: "published", on: release_date.to_s }]

  # `&.` tolerates a missing element; `.to_s` turns the resulting nil into "".
  text = doc.at("//span[@id='pub-release-date']")&.text.to_s.strip
  issued = if (month_year = text[/\w+\s\d{4}/])
             Date.strptime(month_year, "%B %Y")
           elsif (full_date = text[/\w+\s\d{1,2},\s\d{4}/])
             Date.strptime(full_date, "%B %d, %Y")
           end
  dates << { type: "issued", on: issued.to_s } if issued

  dates
end
161
+
# Collect the contributors of a publication: NIST as publisher, followed
# by the authors and then the editors listed on the page.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<RelatonBib::ContributionInfo>]
def fetch_contributors(doc)
  nist = RelatonBib::Organization.new(
    name: "National Institute of Standards and Technology",
    url: "www.nist.gov", abbreviation: "NIST",
  )
  publisher = RelatonBib::ContributionInfo.new(entity: nist, role: ["publisher"])

  authors = contributors(doc.at('//h4[.="Author(s)"]/following-sibling::p'), "author")
  editors = contributors(doc.at('//h4[.="Editor(s)"]/following-sibling::p'), "editor")

  [publisher] + authors + editors
end
177
+
178
+ # rubocop:disable Metrics/CyclomaticComplexity
# Turn a comma-separated contributor paragraph into contribution entries.
#
# An entry of the form "Name (ABBR)" whose name has two or three words and
# does not contain "task", "force" or "group" becomes a person affiliated
# with the organization ABBR; every other entry becomes an organization.
#
# @param doc [Nokogiri::XML::Node, nil] paragraph listing the contributors
# @param role [String] role assigned to every entry, e.g. "author"
# @return [Array<RelatonBib::ContributionInfo>]
def contributors(doc, role)
  return [] if doc.nil?

  doc.text.split(", ").map do |contr|
    /(?<an>.+?)(\s+\((?<abbrev>.+?)\))?$/ =~ contr
    person = abbrev && an.downcase !~ /(task|force|group)/ && an.split.size.between?(2, 3)
    entity =
      if person
        fullname = RelatonBib::FullName.new(
          completename: RelatonBib::LocalizedString.new(an, "en", "Latn"),
        )
        # Known abbreviations get a full name and/or a URL.
        org_name, url =
          case abbrev
          when "NIST" then ["National Institute of Standards and Technology", "www.nist.gov"]
          when "MITRE" then [abbrev, "www.mitre.org"]
          else [abbrev, nil]
          end
        org = RelatonBib::Organization.new name: org_name, url: url, abbreviation: abbrev
        affiliation = RelatonBib::Affilation.new org
        RelatonBib::Person.new(
          name: fullname, affiliation: [affiliation], contacts: [],
        )
      else
        RelatonBib::Organization.new name: an, abbreviation: abbrev
      end
    RelatonBib::ContributionInfo.new entity: entity, role: [role]
  end
end
210
+ # rubocop:enable Metrics/CyclomaticComplexity
211
+
# Derive the edition from a "Rev. N" marker in the document code.
#
# @param code [String, nil] document code, e.g. "800-53 Rev. 4"
# @return [String, nil] "Revision N", or nil when the code has no marker
def fetch_edition(code)
  found = /(?<=Rev\.\s)(?<rev>\d+)/.match(code)
  "Revision #{found[:rev]}" if found
end
217
+
# Fetch abstracts.
#
# Returns one English plain-text abstract built from the text of the
# paragraphs in the first div of the abstract call-out.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>]
def fetch_abstract(doc)
  paragraphs = doc.xpath('//div[contains(@class, "pub-abstract-callout")]/div[1]/p')
  [{ content: paragraphs.text, language: "en", script: "Latn", format: "text/plain" }]
end
230
+
# Fetch copyright.
#
# The owner is always NIST; the start year is the first four-digit number
# in the page's release-date element ("" when the element is missing or
# contains no such number).
#
# @param doc [Nokogiri::HTML::Document]
# @return [Hash]
def fetch_copyright(doc)
  # `&.` tolerates a missing element; `.to_s` turns the resulting nil into "".
  release = doc.at("//span[@id='pub-release-date']")&.text.to_s
  {
    owner: {
      name: "National Institute of Standards and Technology",
      abbreviation: "NIST",
      url: "www.nist.gov",
    },
    from: release[/\d{4}/].to_s,
  }
end
241
+
# Fetch links.
#
# Looks for the PDF download and DOI anchors that follow the
# "Publication:" label. Returns an empty list when the label is absent.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] entries of the form { type: "pdf"|"doi", content: href }
def fetch_link(doc)
  pub = doc.at "//p/strong[.='Publication:']"
  return [] unless pub # no "Publication:" label on the page, nothing to link

  links = []
  pdf = pub.at "./following-sibling::a[.=' Local Download']"
  links << { type: "pdf", content: pdf[:href] } if pdf
  doi = pub.at("./following-sibling::a[contains(.,'(DOI)')]")
  links << { type: "doi", content: doi[:href] } if doi
  links
end
254
+
# Fetch relations.
#
# Collects the anchors of the "supersedes", "part" and "related"
# containers, in that order, as "supersedes", "partOf" and "updates"
# relations respectively.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array] one doc_relation result per anchor
def fetch_relations(doc)
  {
    "supersedes" => '//span[@id="pub-supersedes-container"]/a',
    "partOf" => '//span[@id="pub-part-container"]/a',
    "updates" => '//span[@id="pub-related-container"]/a',
  }.flat_map do |type, query|
    doc.xpath(query).map { |anchor| doc_relation type, anchor }
  end
end
271
+
# Build a relation to the document an anchor points at.
#
# @param type [String] relation type, e.g. "supersedes"
# @param ref [Nokogiri::XML::Node] anchor; its text and :href are read
# @return [RelatonBib::DocumentRelation]
def doc_relation(type, ref)
  formatted = RelatonBib::FormattedRef.new(
    content: ref.text, language: "en", script: "Latn", format: "text/plain",
  )
  # The href is appended to DOMAIN as-is.
  source = RelatonBib::TypedUri.new(type: "src", content: DOMAIN + ref[:href])
  target = RelatonBib::BibliographicItem.new(formattedref: formatted, link: [source])
  RelatonBib::DocumentRelation.new(type: type, bibitem: target)
end
283
+
# Build series entries from the links of the publication history list.
#
# Every link yields one RelatonBib::Series. For a link whose text starts
# with "Draft", the formatted reference is the text after "Draft" plus a
# "(<n>PD)" suffix, where <n> is "I" for the first history entry and the
# 1-based position otherwise.
# NOTE(review): links that do not start with "Draft" yield a Series whose
# formatted reference content is nil - confirm that this is intended.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<RelatonBib::Series>]
def fetch_series(doc)
  series = doc.xpath "//span[@id='pub-history-container']/a"\
                     "|//span[@id='pub-history-container']/span"
  series.each_with_index.map do |s, idx|
    next if s.name == "span"

    iter = idx.zero? ? "I" : idx + 1

    # Keep the text before the first "(" and normalise non-breaking spaces,
    # spelled out as \u00A0 so the replacement is visible in the source.
    # The "Draft\s" lookbehind below only matches an ordinary space.
    content = s.text.match(/^[^\(]+/).to_s.strip.tr("\u00A0", " ")

    ref = case content.match(/\w+/).to_s
          when "Draft" then content.match(/(?<=Draft\s).+/).to_s + " (#{iter}PD)"
          end

    fref = RelatonBib::FormattedRef.new(
      content: ref, language: "en", script: "Latn", format: "text/plain",
    )
    RelatonBib::Series.new(formattedref: fref)
  end.compact
end
307
+
# Wrap the text of every span in the keywords container in a Keyword.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<RelatonNist::Keyword>]
def fetch_keywords(doc)
  doc.xpath("//span[@id='pub-keywords-container']/span").map do |node|
    Keyword.new(node.text)
  end
end
312
+
# Build the comment period of a draft.
#
# The period ends on the "comments due" date and starts on the release
# date, which may be written "Month YYYY" or "Month DD, YYYY". When the
# page announces an extended closing date, it is passed on as well.
#
# @param doc [Nokogiri::HTML::Document]
# @return [RelatonNist::CommentPeriod, nil] nil when the page has no
#   "comments due" element
def fetch_commentperiod(doc)
  cp = doc.at "//span[@id='pub-comments-due']"
  return unless cp

  to = Date.strptime cp.text.strip, "%B %d, %Y"

  d = doc.at("//span[@id='pub-release-date']").text.strip
  # Pick the format from the text so a full release date is not read as
  # "Month YYYY".
  from_format = if d =~ /\w+\s\d{1,2},\s\d{4}/
                  "%B %d, %Y"
                else
                  "%B %Y"
                end
  from = Date.strptime(d, from_format).to_s

  ex = doc.at "//strong[contains(.,'The comment closing date has been extended to')]"
  # Days may be written with one or two digits.
  ext = ex&.text&.match(/\w+\s\d{1,2},\s\d{4}/).to_s
  extended = ext.empty? ? nil : Date.strptime(ext, "%B %d, %Y")
  CommentPeriod.new from, to, extended
end
327
+ end
328
+ end
329
+ end
@@ -0,0 +1,3 @@
module RelatonNist
  # Version of the relaton-nist gem; read by the gemspec.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,47 @@
module RelatonNist
  # Rebuilds a NistBibliographicItem from Relaton XML. The generic fields
  # come from RelatonBib::XMLParser; the keyword and comment-period fields
  # are read here from the item's <ext> element.
  class XMLParser < RelatonBib::XMLParser
    class << self
      # @param xml [String] XML whose root is <bibitem> or <bibdata>
      # @return [RelatonNist::NistBibliographicItem]
      def from_xml(xml)
        root = Nokogiri::XML(xml).at("/bibitem|/bibdata")
        NistBibliographicItem.new(item_data(root))
      end

      private

      # Extend the inherited item data with the fields found under <ext>.
      # @param nistitem [Nokogiri::XML::Element]
      # @return [Hash]
      def item_data(nistitem)
        data = super
        ext = nistitem.at "./ext"
        if ext
          data[:keyword] = fetch_keyword(ext)
          data[:commentperiod] = fetch_commentperiod(ext)
        end
        data
      end

      # @param item [Nokogiri::XML::Element]
      # @return [RelatonNist::DocumentStatus, nil] nil without a <status> child
      def fetch_status(item)
        status = item.at "./status"
        return unless status

        stage, substage, iteration = %w[stage substage iteration].map do |tag|
          status.at(tag)&.text
        end
        DocumentStatus.new(stage: stage, substage: substage, iteration: iteration)
      end

      # @param item [Nokogiri::XML::Element]
      # @return [RelatonNist::CommentPeriod, nil] nil without a
      #   <commentperiod> child
      def fetch_commentperiod(item)
        cp = item.at "./commentperiod"
        return unless cp

        from = cp.at("from").text
        to = cp.at("to")&.text
        extended = cp.at("extended")&.text
        CommentPeriod.new from, to, extended
      end

      # @param item [Nokogiri::XML::Element]
      # @return [Array<RelatonNist::Keyword>] one per <keyword> child, built
      #   from the XML of the keyword's first child node
      def fetch_keyword(item)
        item.xpath("./keyword").map { |kw| Keyword.new kw.children.first.to_xml }
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
+ require "relaton_nist/version"
2
+ require "relaton_nist/nist_bibliography"
3
+
# When the host application has already loaded Relaton, register this
# gem's processor with its registry.
if defined?(Relaton)
  require_relative "relaton/processor"
  Relaton::Registry.instance.register(Relaton::RelatonNist::Processor)
end

module RelatonNist
  # Gem-specific error class derived from StandardError.
  class Error < StandardError; end
end
@@ -0,0 +1,39 @@
# Gem specification for relaton-nist.
lib = File.expand_path("lib", __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "relaton_nist/version"

Gem::Specification.new do |spec|
  spec.name          = "relaton-nist"
  spec.version       = RelatonNist::VERSION
  spec.authors       = ["Ribose Inc."]
  spec.email         = ["open.source@ribose.com"]

  spec.summary       = "RelatonNist: retrieve NIST standards."
  spec.description   = "RelatonNist: retrieve NIST standards."
  spec.homepage      = "https://github.com/metanorma/relaton-nist"
  spec.license       = "MIT"

  # Specify which files should be added to the gem when it is released.
  # `git ls-files -z` lists the files tracked by git; test, spec and
  # feature files are left out of the packaged gem.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]
  spec.required_ruby_version = Gem::Requirement.new(">= 2.4.0")

  spec.add_development_dependency "bundler", "~> 2.0"
  spec.add_development_dependency "byebug"
  spec.add_development_dependency "debase"
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
  spec.add_development_dependency "pry-byebug"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_development_dependency "ruby-debug-ide"
  spec.add_development_dependency "simplecov"
  spec.add_development_dependency "vcr"
  spec.add_development_dependency "webmock"

  spec.add_dependency "relaton-bib", "~> 0.1.6"
end