relaton-nist 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,150 @@
1
+ require "relaton_bib"
2
+ require "relaton_nist/nist_bibliographic_item"
3
+ require "relaton_nist/scrapper"
4
+ require "relaton_nist/hit_collection"
5
+ require "relaton_nist/xml_parser"
6
+ require "relaton_nist/keyword"
7
+ require "relaton_nist/comment_period"
8
+ require "relaton_nist/document_status"
9
+
10
+ module RelatonNist
11
+ class NistBibliography
12
+ class << self
13
# Search for NIST publications matching a reference.
#
# @param text [String] the reference to search for
# @param year [String, nil] publication year, handed to the hit collection
# @param opts [Hash] options, handed to the hit collection
# @return [RelatonNist::HitCollection, Array] the hits, or an empty array
#   (after a warning) when the request fails with an HTTP or socket error
def search(text, year = nil, opts = {})
  begin
    HitCollection.new(text, year, opts)
  rescue OpenURI::HTTPError, SocketError
    warn "Could not access https://www.nist.gov"
    []
  end
end
21
+
22
# Look up a NIST publication by its reference.
#
# The reference may carry a parenthesised date ("SP 800-53 (April 2019)",
# "SP 800-53 (April 5, 2019)") or a parenthesised stage
# ("SP 800-189 (2PD)"). When no year argument is given, a ":year" suffix
# ("NISTIR 8200:2018") is split off and used as the year.
#
# @param code [String] the NIST standard code to look up (e.g. "8200")
# @param year [String] the year the standard was published (optional)
#
# @param opts [Hash] options; the hash passed in is not modified
# @option opts [TrueClass, FalseClass] :all_parts restricted to all parts
#   if all-parts reference is required
# @option opts [TrueClass, FalseClass] :bibdata
#
# @return [Object, nil] the value returned by nistbib_get1: the matching
#   item, or nil when nothing matches
def get(code, year = nil, opts = {})
  # Work on a copy so the date/stage parsed from this reference do not leak
  # into the caller's hash (and from there into a later lookup).
  opts = opts.dup

  # The day is one or two digits: with a strict two-digit day a reference
  # such as "(April 5, 2019)" fell through to the stage capture.
  /^(?<code2>[^\(]+)(\((?<date2>\w+\s(\d{1,2},\s)?\d{4})\))?\s?\(?((?<=\()(?<stage>[^\)]+))?/ =~ code
  if code2
    code = code2.strip
    if date2
      if /\A\w+\s\d{4}\z/ =~ date2
        opts[:issued_date] = Time.strptime date2, "%B %Y"
      elsif /\A\w+\s\d{1,2},\s\d{4}\z/ =~ date2
        opts[:updated_date] = Time.strptime date2, "%B %d, %Y"
      end
    end
    opts[:stage] = stage if stage
  end

  if year.nil?
    /^(?<code1>[^:]+):(?<year1>[^:]+)$/ =~ code
    if code1
      code = code1
      year = year1
    end
  end

  code += "-1" if opts[:all_parts]
  nistbib_get1(code, year, opts)
end
60
+
61
+ private
62
+
63
# Run the search, pick the matching hit and report when there is none.
#
# @param code [String]
# @param year [String, nil]
# @param opts [Hash]
# @return [Object, nil] the matching item; nil when the search filter
#   returns nothing or no hit matches (after fetch_ref_err has warned)
def nistbib_get1(code, year, opts)
  hits = nistbib_search_filter(code, year, opts)
  return nil unless hits

  outcome = nistbib_results_filter(hits, year, opts)
  outcome[:ret] || fetch_ref_err(code, year, outcome[:years])
end
70
+
71
# Go through the search results, fetching them three at a time, and return
# the first one that passes every requested filter: the issued/updated
# date, the stage iteration and, if given, the publication year.
# If no result matches, returns the years that caused a mismatch, for
# error reporting.
#
# @param result [Enumerable] hits to fetch and inspect
# @param year [String, nil] required year of a "published" date
# @param opts [Hash] options
# @option opts [Time] :issued_date required "issued" date
# @option opts [Time] :updated_date required "published" date; only
#   consulted when :issued_date is absent
# @option opts [String] :stage stage such as "2PD"; its third character
#   from the end gives the iteration ("I" => 1, "F" => "final")
#
# @return [Hash] +{ ret: item }+ on success, +{ years: [Integer] }+ otherwise
def nistbib_results_filter(result, year, opts)
  missed_years = []

  # The wanted iteration does not depend on the hit, so work it out once.
  iter = opts[:stage] && opts[:stage][-3]
  iteration = case iter
              when "I" then 1
              when "F" then "final"
              else iter.to_i
              end

  result.each_slice(3) do |s| # pages are fetched three at a time
    fetch_pages(s, 3).each do |r|
      # These checks must skip the hit itself. A `next` placed inside an
      # inner `dates.each` block only skips that date and filters nothing.
      if opts[:issued_date]
        next unless r.dates.any? { |d| d.type == "issued" && d.on == opts[:issued_date] }
      elsif opts[:updated_date]
        next unless r.dates.any? { |d| d.type == "published" && d.on == opts[:updated_date] }
      end
      next if iter && r.status.iteration != iteration
      return { ret: r } unless year

      r.dates.select { |d| d.type == "published" }.each do |d|
        return { ret: r } if year.to_i == d.on.year

        missed_years << d.on.year
      end
    end
  end
  { years: missed_years }
end
117
+
118
# Fetch a batch of hits through a workers pool and return the fetched
# results in the order of the input hits.
#
# @param s [Enumerable] hits; each must respond to +fetch+
# @param n [Integer] value passed to RelatonBib::WorkersPool.new
# @return [Array] fetched results, ordered like +s+
def fetch_pages(s, n)
  pool = RelatonBib::WorkersPool.new(n)
  pool.worker do |job|
    { i: job[:i], hit: job[:hit].fetch }
  end
  s.each_with_index do |hit, position|
    pool << { i: position, hit: hit }
  end
  pool.end
  # Results are put back into input order using the recorded position.
  pool.result.sort_by { |done| done[:i] }.map { |done| done[:hit] }
end
125
+
126
# Search for the reference and keep only the hits whose code contains the
# numeric part of the reference and, when the reference names a series
# (FIPS, SP or NISTIR followed by whitespace), whose series matches it.
#
# @param code [String] reference, e.g. "FIPS 140-2"
# @param year [String, nil] passed on to search
# @param opts [Hash] passed on to search
# @return [Array] the matching hits
def nistbib_search_filter(code, year, opts)
  docid = code.match(%r{[0-9-]{3,}}).to_s
  # "FIPS", as in parse_page; the previous spelling "FISP" never matched a
  # real reference, so FIPS lookups were not filtered by series at all.
  serie = code.match(%r{(FIPS|SP|NISTIR)(?=\s)})
  warn "fetching #{code}..."
  result = search(code, year, opts)
  result.select do |i|
    i.hit[:code]&.include?(docid) && (!serie || i.hit[:serie] == serie.to_s)
  end
end
135
+
136
# Warn that nothing matched the requested reference.
#
# @param code [String] the reference that was looked up
# @param year [String, nil] the requested year, if any
# @param missed_years [Array] years of the hits that did not match
# @return [nil]
def fetch_ref_err(code, year, missed_years)
  id = code
  id = "#{code}:#{year}" if year
  warn "WARNING: no match found online for #{id}. "\
       "The code must be exactly like it is on the standards website."
  unless missed_years.empty?
    warn "(There was no match for #{year}, though there were matches "\
         "found for #{missed_years.join(', ')}.)"
  end
  # A digit-hyphen-digit sequence is taken as a part reference.
  if /\d-\d/.match?(code)
    warn "The provided document part may not exist, or the document "\
         "may no longer be published in parts."
  end
  nil
end
148
+ end
149
+ end
150
+ end
@@ -0,0 +1,329 @@
1
+ require "relaton_bib"
2
+
3
+ module RelatonNist
4
+ class Scrapper
5
+ class << self
6
+ DOMAIN = "https://csrc.nist.gov".freeze
7
+
8
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
9
+
10
# Download the page of a hit and build a bibliographic item from it.
#
# When the page identifier does not start with "SP ", "NISTIR " or
# "FIPS ", the cleaned-up identifier is used as the doctype and the hit
# title replaces the identifier.
#
# @param hit_data [Hash] reads :url, :title, :release_date, :code, :status
# @return [RelatonNist::NistBibliographicItem]
def parse_page(hit_data)
  doc = get_page hit_data[:url]

  docid = fetch_docid(doc)
  titles = fetch_titles(hit_data)
  doctype = "standard"
  if docid[0].id !~ /^(SP|NISTIR|FIPS) /
    doctype = id_cleanup(docid[0].id)
    docid[0] = RelatonBib::DocumentIdentifier.new(id: titles[0][:content], type: "NIST")
  end

  NistBibliographicItem.new(
    fetched: Date.today.to_s,
    type: "standard",
    titles: titles,
    link: fetch_link(doc),
    docid: docid,
    dates: fetch_dates(doc, hit_data[:release_date]),
    contributors: fetch_contributors(doc),
    edition: fetch_edition(hit_data[:code]),
    language: ["en"],
    script: ["Latn"],
    abstract: fetch_abstract(doc),
    docstatus: fetch_status(doc, hit_data[:status]),
    copyright: fetch_copyright(doc),
    relations: fetch_relations(doc),
    series: fetch_series(doc),
    keyword: fetch_keywords(doc),
    commentperiod: fetch_commentperiod(doc),
    doctype: doctype,
  )
end
46
+ # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
47
+
48
# Remove the status suffix from a document identifier: the first
# " (WITHDRAWN)" and the first " (DRAFT)" / " (<word> DRAFT)" (any case).
#
# @param id [String]
# @return [String]
def id_cleanup(id)
  without_withdrawn = id.sub(/ \(WITHDRAWN\)/, "")
  without_withdrawn.sub(/ \(([^) ]+ )?DRAFT\)/i, "")
end
54
+
55
+ private
56
+
57
# Download a page and parse it as HTML.
#
# @param url [String] address of the page
# @return [Nokogiri::HTML::Document]
def get_page(url)
  response = Net::HTTP.get_response(URI(url))
  Nokogiri::HTML(response.body)
end
65
+
66
# Fetch docid from the heading of the publication details.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<RelatonBib::DocumentIdentifier>] a single "NIST"
#   identifier; its id is "?" when the page has no such heading
def fetch_docid(doc)
  # Safe navigation keeps the "?" fallback reachable: calling .text on a
  # missing node raised NoMethodError before the fallback could be used.
  item_ref = doc.at("//div[contains(@class, 'publications-detail')]/h3")&.text&.strip
  return [RelatonBib::DocumentIdentifier.new(type: "NIST", id: "?")] unless item_ref

  [RelatonBib::DocumentIdentifier.new(id: item_ref, type: "NIST")]
end
76
+
77
+ # Fetch id.
78
+ # @param doc [Nokogiri::HTML::Document]
79
+ # @return [String]
80
+ # def fetch_id(doc)
81
+ # doc.at("//div[contains(@class, 'publications-detail')]/h3").text.
82
+ # strip.gsub(/\s/, "")
83
+ # end
84
+
85
# Fetch status.
#
# Maps the hit status to a stage and substage. For draft stages the
# iteration is taken from the publication history: it is the 1-based
# position of the first entry that is not a link, and 1 when that entry
# comes first or there is none.
#
# @param doc [Nokogiri::HTML::Document]
# @param status [String]
# @return [RelatonNist::DocumentStatus]
def fetch_status(doc, status)
  stage, subst =
    case status
    when "draft (withdrawn)" then ["draft-public", "withdrawn"]
    when "retired draft"     then ["draft-public", "retired"]
    when "withdrawn"         then ["final", "withdrawn"]
    when "draft"             then ["draft-public", "active"]
    else [status, "active"]
    end

  iter = nil
  if stage.include? "draft"
    history = doc.xpath("//span[@id='pub-history-container']/a"\
                        "|//span[@id='pub-history-container']/span")
    current = history.each_with_index.find { |entry, _| entry.name != "a" }
    iter = (current && current.last.positive?) ? current.last + 1 : 1
  end

  RelatonNist::DocumentStatus.new stage: stage, substage: subst, iteration: iter
end
137
+
138
# Fetch titles.
#
# @param hit_data [Hash] reads :title
# @return [Array<Hash>] one English, Latin-script, plain-text title
def fetch_titles(hit_data)
  title = { content: hit_data[:title] }
  [title.merge(language: "en", script: "Latn", format: "text/plain")]
end
144
+
145
# Fetch dates.
#
# The "published" date is the release date of the hit. The "issued" date
# is parsed from the release-date element of the page, written either as
# "Month YYYY" or as "Month D, YYYY"; it is left out when the element is
# missing or its text has neither form.
#
# @param doc [Nokogiri::HTML::Document]
# @param release_date [#to_s] release date of the hit
# @return [Array<Hash>] hashes with :type and :on (String)
def fetch_dates(doc, release_date)
  dates = [{ type: "published", on: release_date.to_s }]

  d = doc.at("//span[@id='pub-release-date']")&.text.to_s.strip
  issued = if /(?<month_year>\w+\s\d{4})/ =~ d
             Date.strptime(month_year, "%B %Y")
           elsif /(?<full_date>\w+\s\d{1,2},\s\d{4})/ =~ d
             Date.strptime(full_date, "%B %d, %Y")
           end
  # Without this guard an unparseable date produced an entry with on: "".
  dates << { type: "issued", on: issued.to_s } if issued

  dates
end
161
+
162
# Fetch contributors: NIST as publisher, followed by the authors and then
# the editors listed on the page.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<RelatonBib::ContributionInfo>]
def fetch_contributors(doc)
  publisher = RelatonBib::Organization.new(
    name: "National Institute of Standards and Technology",
    url: "www.nist.gov", abbreviation: "NIST",
  )
  listed = { "Author(s)" => "author", "Editor(s)" => "editor" }.flat_map do |heading, role|
    contributors(doc.at(%{//h4[.="#{heading}"]/following-sibling::p}), role)
  end
  [RelatonBib::ContributionInfo.new(entity: publisher, role: ["publisher"])] + listed
end
177
+
178
+ # rubocop:disable Metrics/CyclomaticComplexity
179
# Turn a ", "-separated list of names into contributions with the given
# role. An entry of the form "Name (ABBREV)" whose name has two or three
# words and contains none of "task", "force", "group" becomes a person
# affiliated with the organization ABBREV; any other entry becomes an
# organization.
#
# @param doc [Nokogiri::XML::Node, nil] element holding the list
# @param role [String]
# @return [Array<RelatonBib::ContributionInfo>] empty when doc is nil
def contributors(doc, role)
  return [] if doc.nil?

  doc.text.split(", ").map do |contr|
    /(?<an>.+?)(\s+\((?<abbrev>.+?)\))?$/ =~ contr
    is_person = abbrev && an.downcase !~ /(task|force|group)/ && an.split.size.between?(2, 3)
    entity =
      if is_person
        org_name, url =
          case abbrev
          when "NIST" then ["National Institute of Standards and Technology", "www.nist.gov"]
          when "MITRE" then [abbrev, "www.mitre.org"]
          else [abbrev, nil]
          end
        org = RelatonBib::Organization.new name: org_name, url: url, abbreviation: abbrev
        fullname = RelatonBib::FullName.new(
          completename: RelatonBib::LocalizedString.new(an, "en", "Latn"),
        )
        RelatonBib::Person.new(
          name: fullname, affiliation: [RelatonBib::Affilation.new(org)], contacts: [],
        )
      else
        RelatonBib::Organization.new name: an, abbreviation: abbrev
      end
    RelatonBib::ContributionInfo.new entity: entity, role: [role]
  end
end
210
+ # rubocop:enable Metrics/CyclomaticComplexity
211
+
212
# Fetch edition from the revision number in the code.
#
# @param code [String, nil] e.g. "SP 800-53 Rev. 4"
# @return [String, nil] "Revision N", or nil when the code has no "Rev. N"
def fetch_edition(code)
  found = /(?<=Rev\.\s)(?<rev>\d+)/.match(code)
  "Revision #{found[:rev]}" if found
end
217
+
218
# Fetch abstracts.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] one English, Latin-script, plain-text abstract
#   holding the text of the abstract paragraphs
def fetch_abstract(doc)
  paragraphs = doc.xpath('//div[contains(@class, "pub-abstract-callout")]/div[1]/p')
  [{ content: paragraphs.text, language: "en", script: "Latn", format: "text/plain" }]
end
230
+
231
# Fetch copyright.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Hash] :owner (NIST) and :from, the first four-digit number in
#   the release date; :from is "" when the page has no release date or
#   the date has no year
def fetch_copyright(doc)
  name = "National Institute of Standards and Technology"
  url = "www.nist.gov"
  # A page without the release-date element no longer raises here.
  d = doc.at("//span[@id='pub-release-date']")&.text.to_s
  from = d[/\d{4}/].to_s
  { owner: { name: name, abbreviation: "NIST", url: url }, from: from }
end
241
+
242
# Fetch links.
#
# Looks for the "Local Download" (PDF) and "(DOI)" links that follow the
# "Publication:" label.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] hashes with :type ("pdf" or "doi") and :content;
#   empty when the page has no "Publication:" label
def fetch_link(doc)
  pub = doc.at "//p/strong[.='Publication:']"
  # Pages without the label used to raise NoMethodError on pub.at.
  return [] unless pub

  links = []
  # Matched with contains(), like the DOI link, so that whitespace around
  # the link text does not matter.
  pdf = pub.at "./following-sibling::a[contains(.,'Local Download')]"
  links << { type: "pdf", content: pdf[:href] } if pdf
  doi = pub.at("./following-sibling::a[contains(.,'(DOI)')]")
  links << { type: "doi", content: doi[:href] } if doi
  links
end
254
+
255
# Fetch relations: the links in the "supersedes", "part" and "related"
# containers become "supersedes", "partOf" and "updates" relations, in
# that order.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<RelatonBib::DocumentRelation>]
def fetch_relations(doc)
  {
    "pub-supersedes-container" => "supersedes",
    "pub-part-container" => "partOf",
    "pub-related-container" => "updates",
  }.flat_map do |container, type|
    doc.xpath(%{//span[@id="#{container}"]/a}).map { |r| doc_relation type, r }
  end
end
271
+
272
# Build a relation to the document a link points to.
#
# @param type [String] relation type
# @param ref [Nokogiri::XML::Node] link; its text is the formatted
#   reference and its href, appended to DOMAIN, the source URI
# @return [RelatonBib::DocumentRelation]
def doc_relation(type, ref)
  formattedref = RelatonBib::FormattedRef.new(
    content: ref.text, language: "en", script: "Latn", format: "text/plain",
  )
  source = RelatonBib::TypedUri.new(type: "src", content: DOMAIN + ref[:href])
  bibitem = RelatonBib::BibliographicItem.new(formattedref: formattedref, link: [source])
  RelatonBib::DocumentRelation.new(type: type, bibitem: bibitem)
end
283
+
284
+ def fetch_series(doc)
285
+ series = doc.xpath "//span[@id='pub-history-container']/a"\
286
+ "|//span[@id='pub-history-container']/span"
287
+ series.map.with_index do |s, idx|
288
+ next if s.name == "span"
289
+
290
+ iter = if idx.zero? then "I"
291
+ # elsif status == "final" && idx == (series.size - 1) then "F"
292
+ else idx + 1
293
+ end
294
+
295
+ content = s.text.match(/^[^\(]+/).to_s.strip.gsub " ", " "
296
+
297
+ ref = case content.match(/\w+/).to_s
298
+ when "Draft" then content.match(/(?<=Draft\s).+/).to_s + " (#{iter}PD)"
299
+ end
300
+
301
+ fref = RelatonBib::FormattedRef.new(
302
+ content: ref, language: "en", script: "Latn", format: "text/plain",
303
+ )
304
+ RelatonBib::Series.new(formattedref: fref)
305
+ end.select { |s| s }
306
+ end
307
+
308
# Fetch keywords from the keywords container.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<RelatonNist::Keyword>]
def fetch_keywords(doc)
  doc.xpath("//span[@id='pub-keywords-container']/span").map do |kw|
    Keyword.new(kw.text)
  end
end
312
+
313
# Fetch the public comment period.
#
# @param doc [Nokogiri::HTML::Document]
# @return [RelatonNist::CommentPeriod, nil] built from the release date
#   (String), the comments-due date (Date) and the extended closing date
#   (Date or nil); nil when the page has no comments-due element
# @raise [ArgumentError] when a date on the page cannot be parsed
def fetch_commentperiod(doc)
  cp = doc.at "//span[@id='pub-comments-due']"
  return unless cp

  to = Date.strptime cp.text.strip, "%B %d, %Y"

  # The release date appears as "Month YYYY" or "Month D, YYYY"; accept
  # both, in the same order as fetch_dates.
  d = doc.at("//span[@id='pub-release-date']").text.strip
  released = if /(?<month_year>\w+\s\d{4})/ =~ d
               Date.strptime(month_year, "%B %Y")
             elsif /(?<full_date>\w+\s\d{1,2},\s\d{4})/ =~ d
               Date.strptime(full_date, "%B %d, %Y")
             else
               Date.strptime(d, "%B %Y")
             end
  from = released.to_s

  ex = doc.at "//strong[contains(.,'The comment closing date has been extended to')]"
  # One- or two-digit day, so "July 5, 2019" is recognised as well.
  ext = ex&.text&.match(/\w+\s\d{1,2},\s\d{4}/).to_s
  extended = ext.empty? ? nil : Date.strptime(ext, "%B %d, %Y")
  CommentPeriod.new from, to, extended
end
327
+ end
328
+ end
329
+ end
@@ -0,0 +1,3 @@
1
+ module RelatonNist
2
+ VERSION = "0.1.0".freeze
3
+ end
@@ -0,0 +1,47 @@
1
module RelatonNist
  # Parses Relaton XML into NistBibliographicItem objects, adding the
  # keyword and comment-period data kept in the ext element.
  class XMLParser < RelatonBib::XMLParser
    class << self
      # @param xml [String] XML with a bibitem or bibdata root element
      # @return [RelatonNist::NistBibliographicItem]
      def from_xml(xml)
        doc = Nokogiri::XML xml
        nistitem = doc.at("/bibitem|/bibdata")
        NistBibliographicItem.new(item_data(nistitem))
      end

      private

      # Extend the data collected by the parent parser with :keyword and
      # :commentperiod when the item has an ext element.
      #
      # @param nistitem [Nokogiri::XML::Element]
      # @return [Hash]
      def item_data(nistitem)
        data = super
        ext = nistitem.at "./ext"
        return data unless ext

        data[:keyword] = fetch_keyword(ext)
        data[:commentperiod] = fetch_commentperiod(ext)
        data
      end

      # @param item [Nokogiri::XML::Element]
      # @return [RelatonNist::DocumentStatus, nil] nil without a status element
      def fetch_status(item)
        status = item.at "./status"
        return unless status

        DocumentStatus.new(
          stage: status.at("stage")&.text,
          substage: status.at("substage")&.text,
          iteration: status.at("iteration")&.text,
        )
      end

      # @param item [Nokogiri::XML::Element]
      # @return [RelatonNist::CommentPeriod, nil] nil without a
      #   commentperiod element
      def fetch_commentperiod(item)
        cp = item.at "./commentperiod"
        return unless cp

        # "from" is read as defensively as "to" and "extended": a missing
        # child yields nil instead of raising NoMethodError.
        CommentPeriod.new cp.at("from")&.text, cp.at("to")&.text, cp.at("extended")&.text
      end

      # @param item [Nokogiri::XML::Element]
      # @return [Array<RelatonNist::Keyword>] one per non-empty keyword element
      def fetch_keyword(item)
        # An empty keyword element has no child to serialise; skip it.
        contents = item.xpath("./keyword").map { |kw| kw.children.first }.compact
        contents.map { |content| Keyword.new content.to_xml }
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
+ require "relaton_nist/version"
2
+ require "relaton_nist/nist_bibliography"
3
+
4
+ if defined? Relaton
5
+ require_relative "relaton/processor"
6
+ Relaton::Registry.instance.register(Relaton::RelatonNist::Processor)
7
+ end
8
+
9
+ module RelatonNist
10
+ class Error < StandardError; end
11
+ # Your code goes here...
12
+ end
@@ -0,0 +1,39 @@
1
lib = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "relaton_nist/version"

Gem::Specification.new do |spec|
  spec.name          = "relaton-nist"
  spec.version       = RelatonNist::VERSION
  spec.authors       = ["Ribose Inc."]
  spec.email         = ["open.source@ribose.com"]

  # Spelling of "retrieve" corrected; this text is what the gem index shows.
  spec.summary       = "RelatonNist: retrieve NIST standards."
  spec.description   = "RelatonNist: retrieve NIST standards."
  spec.homepage      = "https://github.com/metanorma/relaton-nist"
  spec.license       = "MIT"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]
  spec.required_ruby_version = Gem::Requirement.new(">= 2.4.0")

  spec.add_development_dependency "bundler", "~> 2.0"
  spec.add_development_dependency "byebug"
  spec.add_development_dependency "debase"
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
  spec.add_development_dependency "pry-byebug"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_development_dependency "ruby-debug-ide"
  spec.add_development_dependency "simplecov"
  spec.add_development_dependency "vcr"
  spec.add_development_dependency "webmock"

  spec.add_dependency "relaton-bib", "~> 0.1.6"
end