relaton-nist 0.9.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/grammars/basicdoc.rng +12 -55
- data/grammars/biblio.rng +144 -49
- data/grammars/isodoc.rng +592 -29
- data/grammars/isostandard.rng +145 -472
- data/grammars/nist.rng +98 -139
- data/lib/relaton_nist/nist_bibliography.rb +9 -8
- data/lib/relaton_nist/scrapper.rb +24 -12
- data/lib/relaton_nist/version.rb +1 -1
- data/relaton_nist.gemspec +1 -1
- metadata +4 -4
data/grammars/nist.rng
CHANGED
@@ -5,104 +5,14 @@
|
|
5
5
|
we cannot have a new default namespace: we will end up with a grammar with two different
|
6
6
|
namespaces, one for isostandard and one for csand additions. And we do not want that.
|
7
7
|
-->
|
8
|
-
<include href="
|
8
|
+
<include href="isodoc.rng">
|
9
9
|
<start>
|
10
10
|
<ref name="nist-standard"/>
|
11
11
|
</start>
|
12
|
-
<define name="figure">
|
13
|
-
<element name="figure">
|
14
|
-
<attribute name="id">
|
15
|
-
<data type="ID"/>
|
16
|
-
</attribute>
|
17
|
-
<optional>
|
18
|
-
<ref name="tname"/>
|
19
|
-
</optional>
|
20
|
-
<choice>
|
21
|
-
<ref name="image"/>
|
22
|
-
<ref name="pre"/>
|
23
|
-
<oneOrMore>
|
24
|
-
<ref name="subfigure"/>
|
25
|
-
</oneOrMore>
|
26
|
-
</choice>
|
27
|
-
<zeroOrMore>
|
28
|
-
<ref name="fn"/>
|
29
|
-
</zeroOrMore>
|
30
|
-
<optional>
|
31
|
-
<ref name="dl"/>
|
32
|
-
</optional>
|
33
|
-
<zeroOrMore>
|
34
|
-
<ref name="note"/>
|
35
|
-
</zeroOrMore>
|
36
|
-
</element>
|
37
|
-
</define>
|
38
|
-
<define name="subfigure">
|
39
|
-
<element name="figure">
|
40
|
-
<attribute name="id">
|
41
|
-
<data type="ID"/>
|
42
|
-
</attribute>
|
43
|
-
<optional>
|
44
|
-
<ref name="tname"/>
|
45
|
-
</optional>
|
46
|
-
<choice>
|
47
|
-
<ref name="image"/>
|
48
|
-
<ref name="pre"/>
|
49
|
-
</choice>
|
50
|
-
</element>
|
51
|
-
</define>
|
52
12
|
<define name="DocumentType">
|
53
13
|
<value>standard</value>
|
54
14
|
</define>
|
55
|
-
|
56
|
-
<choice>
|
57
|
-
<value>alternative</value>
|
58
|
-
<value>original</value>
|
59
|
-
<value>unofficial</value>
|
60
|
-
<value>subtitle</value>
|
61
|
-
<value>main</value>
|
62
|
-
</choice>
|
63
|
-
</define>
|
64
|
-
<!-- DocRelationType |= "obsoletedBy" | "supersedes" | "supersededBy" -->
|
65
|
-
<define name="DocRelationType">
|
66
|
-
<choice>
|
67
|
-
<value>obsoletes</value>
|
68
|
-
<value>updates</value>
|
69
|
-
<value>updatedBy</value>
|
70
|
-
<value>complements</value>
|
71
|
-
<value>derivedFrom</value>
|
72
|
-
<value>translatedFrom</value>
|
73
|
-
<value>adoptedFrom</value>
|
74
|
-
<value>equivalent</value>
|
75
|
-
<value>identical</value>
|
76
|
-
<value>nonequivalent</value>
|
77
|
-
<value>includedIn</value>
|
78
|
-
<value>includes</value>
|
79
|
-
<value>instance</value>
|
80
|
-
<value>partOf</value>
|
81
|
-
<value>hasDraft</value>
|
82
|
-
<value>obsoletedBy</value>
|
83
|
-
<value>supersedes</value>
|
84
|
-
<value>supersededBy</value>
|
85
|
-
</choice>
|
86
|
-
</define>
|
87
|
-
<!-- BibliographicDateType |= "abandoned" | "superseded" -->
|
88
|
-
<define name="BibliographicDateType">
|
89
|
-
<choice>
|
90
|
-
<value>published</value>
|
91
|
-
<value>accessed</value>
|
92
|
-
<value>created</value>
|
93
|
-
<value>implemented</value>
|
94
|
-
<value>obsoleted</value>
|
95
|
-
<value>confirmed</value>
|
96
|
-
<value>updated</value>
|
97
|
-
<value>issued</value>
|
98
|
-
<value>transmitted</value>
|
99
|
-
<value>copied</value>
|
100
|
-
<value>unchanged</value>
|
101
|
-
<value>circulated</value>
|
102
|
-
<value>abandoned</value>
|
103
|
-
<value>superseded</value>
|
104
|
-
</choice>
|
105
|
-
</define>
|
15
|
+
<!-- TitleType = ( "alternative" | "original" | "unofficial" | "subtitle" | "main" ) -->
|
106
16
|
<define name="preface">
|
107
17
|
<element name="preface">
|
108
18
|
<optional>
|
@@ -112,7 +22,10 @@
|
|
112
22
|
<ref name="foreword"/>
|
113
23
|
</optional>
|
114
24
|
<zeroOrMore>
|
115
|
-
<
|
25
|
+
<choice>
|
26
|
+
<ref name="clause"/>
|
27
|
+
<ref name="errata_clause"/>
|
28
|
+
</choice>
|
116
29
|
</zeroOrMore>
|
117
30
|
<optional>
|
118
31
|
<ref name="reviewernote"/>
|
@@ -122,62 +35,41 @@
|
|
122
35
|
</optional>
|
123
36
|
</element>
|
124
37
|
</define>
|
125
|
-
<define name="
|
126
|
-
<element name="
|
127
|
-
<
|
128
|
-
<
|
129
|
-
|
130
|
-
</attribute>
|
131
|
-
</optional>
|
132
|
-
<optional>
|
133
|
-
<attribute name="language"/>
|
134
|
-
</optional>
|
135
|
-
<optional>
|
136
|
-
<attribute name="script"/>
|
137
|
-
</optional>
|
138
|
-
<optional>
|
139
|
-
<attribute name="inline-header">
|
140
|
-
<data type="boolean"/>
|
141
|
-
</attribute>
|
142
|
-
</optional>
|
143
|
-
<optional>
|
144
|
-
<attribute name="obligation">
|
145
|
-
<choice>
|
146
|
-
<value>normative</value>
|
147
|
-
<value>informative</value>
|
148
|
-
</choice>
|
149
|
-
</attribute>
|
150
|
-
</optional>
|
151
|
-
<optional>
|
152
|
-
<ref name="section-title"/>
|
153
|
-
</optional>
|
154
|
-
<zeroOrMore>
|
155
|
-
<!--
|
156
|
-
allow hanging paragraps in annexes: they introduce lists
|
157
|
-
( paragraph-with-footnote | table | note | formula | admonition | ol | ul | dl | figure | quote | sourcecode | review | example )*,
|
158
|
-
-->
|
159
|
-
<ref name="BasicBlock"/>
|
160
|
-
</zeroOrMore>
|
161
|
-
<zeroOrMore>
|
162
|
-
<ref name="note"/>
|
163
|
-
</zeroOrMore>
|
164
|
-
<choice>
|
165
|
-
<zeroOrMore>
|
166
|
-
<ref name="clause-hanging-paragraph-with-footnote"/>
|
167
|
-
</zeroOrMore>
|
168
|
-
<ref name="terms"/>
|
169
|
-
</choice>
|
38
|
+
<define name="editorialgroup">
|
39
|
+
<element name="editorialgroup">
|
40
|
+
<oneOrMore>
|
41
|
+
<ref name="committee"/>
|
42
|
+
</oneOrMore>
|
170
43
|
</element>
|
171
44
|
</define>
|
172
45
|
<define name="BibDataExtensionType">
|
173
46
|
<optional>
|
174
47
|
<ref name="doctype"/>
|
175
48
|
</optional>
|
49
|
+
<optional>
|
50
|
+
<ref name="editorialgroup"/>
|
51
|
+
</optional>
|
52
|
+
<zeroOrMore>
|
53
|
+
<ref name="ics"/>
|
54
|
+
</zeroOrMore>
|
176
55
|
<optional>
|
177
56
|
<ref name="commentperiod"/>
|
178
57
|
</optional>
|
179
58
|
</define>
|
180
59
|
</include>
|
60
|
+
<define name="DocRelationType" combine="choice">
|
61
|
+
<choice>
|
62
|
+
<value>obsoletedBy</value>
|
63
|
+
<value>supersedes</value>
|
64
|
+
<value>supersededBy</value>
|
65
|
+
</choice>
|
66
|
+
</define>
|
67
|
+
<define name="BibliographicDateType" combine="choice">
|
68
|
+
<choice>
|
69
|
+
<value>abandoned</value>
|
70
|
+
<value>superseded</value>
|
71
|
+
</choice>
|
72
|
+
</define>
|
181
73
|
<define name="commentperiod">
|
182
74
|
<element name="commentperiod">
|
183
75
|
<element name="from">
|
@@ -205,6 +97,71 @@
|
|
205
97
|
<ref name="Basic-Section"/>
|
206
98
|
</element>
|
207
99
|
</define>
|
100
|
+
<define name="committee">
|
101
|
+
<element name="committee">
|
102
|
+
<text/>
|
103
|
+
</element>
|
104
|
+
</define>
|
105
|
+
<define name="errata">
|
106
|
+
<element name="errata">
|
107
|
+
<oneOrMore>
|
108
|
+
<ref name="erratarow"/>
|
109
|
+
</oneOrMore>
|
110
|
+
</element>
|
111
|
+
</define>
|
112
|
+
<define name="erratarow">
|
113
|
+
<element name="row">
|
114
|
+
<element name="date">
|
115
|
+
<ref name="ISO8601Date"/>
|
116
|
+
</element>
|
117
|
+
<element name="type">
|
118
|
+
<text/>
|
119
|
+
</element>
|
120
|
+
<element name="change">
|
121
|
+
<oneOrMore>
|
122
|
+
<ref name="TextElement"/>
|
123
|
+
</oneOrMore>
|
124
|
+
</element>
|
125
|
+
<element name="pages">
|
126
|
+
<text/>
|
127
|
+
</element>
|
128
|
+
</element>
|
129
|
+
</define>
|
130
|
+
<define name="errata_clause">
|
131
|
+
<element name="clause">
|
132
|
+
<optional>
|
133
|
+
<attribute name="id">
|
134
|
+
<data type="ID"/>
|
135
|
+
</attribute>
|
136
|
+
</optional>
|
137
|
+
<optional>
|
138
|
+
<attribute name="language"/>
|
139
|
+
</optional>
|
140
|
+
<optional>
|
141
|
+
<attribute name="script"/>
|
142
|
+
</optional>
|
143
|
+
<optional>
|
144
|
+
<attribute name="obligation">
|
145
|
+
<choice>
|
146
|
+
<value>normative</value>
|
147
|
+
<value>informative</value>
|
148
|
+
</choice>
|
149
|
+
</attribute>
|
150
|
+
</optional>
|
151
|
+
<optional>
|
152
|
+
<ref name="section-title"/>
|
153
|
+
</optional>
|
154
|
+
<group>
|
155
|
+
<zeroOrMore>
|
156
|
+
<ref name="BasicBlock"/>
|
157
|
+
</zeroOrMore>
|
158
|
+
<zeroOrMore>
|
159
|
+
<ref name="note"/>
|
160
|
+
</zeroOrMore>
|
161
|
+
</group>
|
162
|
+
<ref name="errata"/>
|
163
|
+
</element>
|
164
|
+
</define>
|
208
165
|
<define name="nist-standard">
|
209
166
|
<element name="nist-standard">
|
210
167
|
<ref name="bibdata"/>
|
@@ -221,7 +178,9 @@
|
|
221
178
|
<zeroOrMore>
|
222
179
|
<ref name="annex"/>
|
223
180
|
</zeroOrMore>
|
224
|
-
<
|
181
|
+
<optional>
|
182
|
+
<ref name="bibliography"/>
|
183
|
+
</optional>
|
225
184
|
</element>
|
226
185
|
</define>
|
227
186
|
</grammar>
|
@@ -111,21 +111,22 @@ module RelatonNist
|
|
111
111
|
{ years: missed_years }
|
112
112
|
end
|
113
113
|
|
114
|
-
def fetch_pages(
|
115
|
-
workers = RelatonBib::WorkersPool.new
|
114
|
+
def fetch_pages(hits, threads)
|
115
|
+
workers = RelatonBib::WorkersPool.new threads
|
116
116
|
workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
|
117
|
-
|
117
|
+
hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
|
118
118
|
workers.end
|
119
|
-
workers.result.
|
119
|
+
workers.result.sort_by { |a| a[:i] }.map { |x| x[:hit] }
|
120
120
|
end
|
121
121
|
|
122
122
|
def nistbib_search_filter(code, year, opts)
|
123
|
-
|
123
|
+
idregex = %r{[0-9-]{3,}}
|
124
|
+
docid = code.match(idregex).to_s
|
124
125
|
serie = code.match(%r{(FISP|SP|NISTIR)(?=\s)})
|
125
126
|
warn "[relaton-nist] (\"#{code}\") fetching..."
|
126
127
|
result = search(code, year, opts)
|
127
128
|
result.select do |i|
|
128
|
-
i.hit[:code]&.
|
129
|
+
i.hit[:code]&.match(idregex).to_s == docid && (!serie || i.hit[:serie] == serie.to_s)
|
129
130
|
end
|
130
131
|
end
|
131
132
|
|
@@ -136,8 +137,8 @@ module RelatonNist
|
|
136
137
|
warn "[relaton-nist] (There was no match for #{year}, though there were matches "\
|
137
138
|
"found for #{missed_years.join(', ')}.)" unless missed_years.empty?
|
138
139
|
if /\d-\d/ =~ code
|
139
|
-
warn "[relaton-nist] The provided document part may not exist,
|
140
|
-
"may no longer be published in parts."
|
140
|
+
warn "[relaton-nist] The provided document part may not exist, "\
|
141
|
+
"or the document may no longer be published in parts."
|
141
142
|
end
|
142
143
|
nil
|
143
144
|
end
|
@@ -16,10 +16,10 @@ module RelatonNist
|
|
16
16
|
else
|
17
17
|
from_csrs hit_data
|
18
18
|
end
|
19
|
-
doctype = "standard"
|
19
|
+
# doctype = "standard"
|
20
20
|
titles = fetch_titles(hit_data)
|
21
21
|
unless /^(SP|NISTIR|FIPS) / =~ item_data[:docid][0].id
|
22
|
-
doctype = id_cleanup(item_data[:docid][0].id)
|
22
|
+
# doctype = id_cleanup(item_data[:docid][0].id)
|
23
23
|
item_data[:docid][0] = RelatonBib::DocumentIdentifier.new(
|
24
24
|
id: titles[0][:content].upcase, type: "NIST",
|
25
25
|
)
|
@@ -78,9 +78,9 @@ module RelatonNist
|
|
78
78
|
# Strip status from doc id
|
79
79
|
# @param id String
|
80
80
|
# @return String
|
81
|
-
def id_cleanup(id)
|
82
|
-
|
83
|
-
end
|
81
|
+
# def id_cleanup(id)
|
82
|
+
# id.sub(/ \(WITHDRAWN\)/, "").sub(/ \(([^) ]+ )?DRAFT\)/i, "")
|
83
|
+
# end
|
84
84
|
|
85
85
|
# Get page.
|
86
86
|
# @param path [String] page's path
|
@@ -304,7 +304,7 @@ module RelatonNist
|
|
304
304
|
def name_parts(part, lang, script)
|
305
305
|
return [] unless part
|
306
306
|
|
307
|
-
[RelatonBib::LocalizedString.new(
|
307
|
+
[RelatonBib::LocalizedString.new(part, lang, script)]
|
308
308
|
end
|
309
309
|
|
310
310
|
# @param doc [String, Hash]
|
@@ -325,7 +325,9 @@ module RelatonNist
|
|
325
325
|
# @param doc [Nokogiri::HTML::Document]
|
326
326
|
# @return [Array<Hash>]
|
327
327
|
def fetch_abstract(doc)
|
328
|
-
abstract_content = doc.xpath(
|
328
|
+
abstract_content = doc.xpath(
|
329
|
+
'//div[contains(@class, "pub-abstract-callout")]/div[1]/p',
|
330
|
+
).text
|
329
331
|
[{
|
330
332
|
content: abstract_content,
|
331
333
|
language: "en",
|
@@ -336,7 +338,7 @@ module RelatonNist
|
|
336
338
|
|
337
339
|
# Fetch copyright.
|
338
340
|
# @param doc [Nokogiri::HTML::Document, String]
|
339
|
-
# @return [Hash]
|
341
|
+
# @return [Array<Hash>]
|
340
342
|
def fetch_copyright(doc)
|
341
343
|
name = "National Institute of Standards and Technology"
|
342
344
|
url = "www.nist.gov"
|
@@ -345,9 +347,11 @@ module RelatonNist
|
|
345
347
|
doc.at("//span[@id='pub-release-date']").text.strip
|
346
348
|
end
|
347
349
|
from = d.match(/\d{4}/).to_s
|
348
|
-
{ owner: { name: name, abbreviation: "NIST", url: url }, from: from }
|
350
|
+
[{ owner: [{ name: name, abbreviation: "NIST", url: url }], from: from }]
|
349
351
|
end
|
350
352
|
|
353
|
+
# rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
354
|
+
|
351
355
|
# Fetch links.
|
352
356
|
# @param doc [Nokogiri::HTML::Document, Hash]
|
353
357
|
# @return [Array<Hash>]
|
@@ -365,6 +369,7 @@ module RelatonNist
|
|
365
369
|
links << { type: "doi", content: doi } if doi
|
366
370
|
links
|
367
371
|
end
|
372
|
+
# rubocop:enable Metrics/MethodLength
|
368
373
|
|
369
374
|
# Fetch relations.
|
370
375
|
# @param doc [Nokogiri::HTML::Document]
|
@@ -382,6 +387,7 @@ module RelatonNist
|
|
382
387
|
doc_relation "updates", r.text, DOMAIN + r[:href]
|
383
388
|
end
|
384
389
|
end
|
390
|
+
# rubocop:enable Metrics/AbcSize
|
385
391
|
|
386
392
|
def fetch_relations_json(doc)
|
387
393
|
relations = doc["supersedes"].map do |r|
|
@@ -409,6 +415,8 @@ module RelatonNist
|
|
409
415
|
)
|
410
416
|
end
|
411
417
|
|
418
|
+
# rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
419
|
+
|
412
420
|
# @param doc [Nokogiri::HTML::Document]
|
413
421
|
# @return [Array<RelatonBib::Series>]
|
414
422
|
def fetch_series(doc)
|
@@ -418,14 +426,14 @@ module RelatonNist
|
|
418
426
|
next if s.name == "span"
|
419
427
|
|
420
428
|
iter = if idx.zero? then "I"
|
421
|
-
# elsif status == "final" && idx == (series.size - 1) then "F"
|
422
429
|
else idx + 1
|
423
430
|
end
|
424
431
|
|
425
432
|
content = s.text.match(/^[^\(]+/).to_s.strip.squeeze " "
|
426
433
|
|
427
434
|
ref = case s.text
|
428
|
-
when /^Draft/
|
435
|
+
when /^Draft/
|
436
|
+
content.match(/(?<=Draft\s).+/).to_s + " (#{iter}PD)"
|
429
437
|
when /\(Draft\)/ then content + " (#{iter}PD)"
|
430
438
|
else content
|
431
439
|
end
|
@@ -436,6 +444,7 @@ module RelatonNist
|
|
436
444
|
RelatonBib::Series.new(formattedref: fref)
|
437
445
|
end.select { |s| s }
|
438
446
|
end
|
447
|
+
# rubocop:enable Metrics/MethodLength, Metrics/AbcSize
|
439
448
|
|
440
449
|
# @param doc [Nokogiri::HTML::Document, Hash]
|
441
450
|
# @return [Array<RelatonNist::Keyword>]
|
@@ -448,6 +457,7 @@ module RelatonNist
|
|
448
457
|
kws.map { |kw| kw.is_a?(String) ? kw : kw.text }
|
449
458
|
end
|
450
459
|
|
460
|
+
# rubocop:disable Metrics/AbcSize
|
451
461
|
# @param doc [Nokogiri::HTML::Document]
|
452
462
|
# @return [RelatonNist::CommentPeriod, NilClass]
|
453
463
|
def fetch_commentperiod(doc)
|
@@ -459,11 +469,13 @@ module RelatonNist
|
|
459
469
|
d = doc.at("//span[@id='pub-release-date']").text.strip
|
460
470
|
from = Date.strptime(d, "%B %Y").to_s
|
461
471
|
|
462
|
-
ex = doc.at "//strong[contains(.,'The comment closing date has been
|
472
|
+
ex = doc.at "//strong[contains(.,'The comment closing date has been "\
|
473
|
+
"extended to')]"
|
463
474
|
ext = ex&.text&.match(/\w+\s\d{2},\s\d{4}/).to_s
|
464
475
|
extended = ext.empty? ? nil : Date.strptime(ext, "%B %d, %Y")
|
465
476
|
CommentPeriod.new from: from, to: to, extended: extended
|
466
477
|
end
|
478
|
+
# rubocop:enable Metrics/AbcSize
|
467
479
|
|
468
480
|
# @param json [Hash]
|
469
481
|
# @return [RelatonNist::CommentPeriod, NilClass]
|