relaton-nist 1.9.1 → 1.9.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d440504b749834c27875dc8a1ee84346451578db5dd1588b75eafdb31087b64c
4
- data.tar.gz: ecfb305d172a8afa171fa4493b1ffec2dd458fd413143a7f197034a392ddf451
3
+ metadata.gz: 76cfec1954dfd58bf622a6a47e34bdbcfe4cf471106eb7ad19582654078a6d38
4
+ data.tar.gz: 9769f406d5b98840b5bc099d78ab811388170d1fae5872211a7a0c9ed26d869f
5
5
  SHA512:
6
- metadata.gz: 7af96feb236bddbabd7e90982637b608813d5c2163c7453f217c658035cf34b0ef80adc6441d6be213b77e62b5625d95fb10a30914fc94a1b6cf966b1c921bf6
7
- data.tar.gz: 9b9daa02fc2ec7df33ad5c8cdf768c711b1bac0122191e7cfc79ee7c8ed2b7de00fbe6509b3347f387cf4868b24448f91f1a687e45b80fe65ce31d3f7a46cad5
6
+ metadata.gz: 0e3deefff699c77103afb631a03283697dc05f5c952b60b3602961df37df2095102b16c32e229aeeceb76da12cfda09e43e6b0788b6a4c1f02b74dea8a49f818
7
+ data.tar.gz: 3244707d2595b5b45bab973399a04b8cc791d3b008e2ef05fcc99d9355cec6915556aea8528e5f992e4ba0452af82bda09fb93737741a72c08a64e7a0039d161
data/.gitignore CHANGED
@@ -7,6 +7,7 @@
7
7
  /spec/reports/
8
8
  /tmp/
9
9
  .vscode/
10
+ /data/
10
11
  .rubocop-https---raw-githubusercontent-com-riboseinc-oss-guides-master-ci-rubocop-yml
11
12
 
12
13
  # rspec failure tracking
data/README.adoc CHANGED
@@ -159,6 +159,25 @@ RelatonNist::NistBibliographicItem.from_hash hash
159
159
  ...
160
160
  ----
161
161
 
162
+ === Fetch data
163
+
164
+ This gem uses the https://raw.githubusercontent.com/usnistgov/NIST-Tech-Pubs/nist-pages/xml/allrecords.xml dataset as one of its data sources.
165
+
166
+ The method `RelatonNist::DataFetcher.fetch(output: "data", format: "yaml")` fetches all the documents from the dataset and saves them to the `./data` folder in YAML format.
167
+ Arguments:
168
+
169
+ - `output` - folder to save documents (default './data').
170
+ - `format` - format in which the documents are saved. Possible formats are: `yaml`, `xml` (default `yaml`).
171
+
172
+ [source,ruby]
173
+ ----
174
+ RelatonNist::DataFetcher.fetch
175
+ Started at: 2021-09-01 18:01:01 +0200
176
+ Stopped at: 2021-09-01 18:01:43 +0200
177
+ Done in: 42 sec.
178
+ => nil
179
+ ----
180
+
162
181
  == Development
163
182
 
164
183
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -0,0 +1,221 @@
1
+ # frozen_string_literal: true
2
+
3
+ # require 'English'
4
+ # require 'mechanize'
5
+ # require "fileutils"
6
+ require "yaml"
7
+ # require "open-uri"
8
+ # require "nokogiri"
9
+ # require "relaton_nist"
10
+
11
module RelatonNist
  #
  # Downloads the NIST Tech Pubs XML dataset (see URL), converts every
  # `report-paper_metadata` record into a RelatonNist::NistBibliographicItem
  # and writes each item to its own file in the output folder.
  #
  class DataFetcher
    # Maps the `relationship-type` attribute of a related item to the
    # relation type stored in the bibliographic item. Attribute values
    # that are not listed here yield a `nil` relation type.
    # NOTE(review): "hasPreprint" maps to "hasReprint" - confirm it is intended.
    RELATION_TYPES = {
      "replaces" => "obsoletes",
      "isVersionOf" => "editionOf",
      "hasTranslation" => "hasTranslation",
      "isTranslationOf" => "translatedFrom",
      "hasPreprint" => "hasReprint",
      "isSupplementTo" => "complements",
    }.freeze
    URL = "https://raw.githubusercontent.com/usnistgov/NIST-Tech-Pubs/nist-pages/xml/allrecords.xml"

    # Per-DOI corrections of the publisher item number: the first element is
    # the pattern to find in the item number, the second is its replacement.
    DOCID_FIXES = {
      "10.6028/NBS.CIRC.12e2revjune" => ["13e", "12e"],
      "10.6028/NBS.CIRC.36e2" => ["46e", "36e"],
      "10.6028/NBS.HB.67suppJune1967" => ["1965", "1967"],
      "10.6028/NBS.HB.105-1r1990" => ["105-1-1990", "105-1r1990"],
      "10.6028/NIST.HB.150-10-1995" => [/150-10$/, "150-10-1995"],
    }.freeze

    # @param output [String] folder the documents are written to
    # @param format [String] file format and extension ("yaml" or "xml")
    def initialize(output, format)
      @output = output
      @format = format
    end

    #
    # Read the NIST identifier and the DOI of a record.
    #
    # The NIST identifier is taken from `publisher_item/item_number` (or
    # `publisher_item/identifier`), stripped of a leading slash and then
    # corrected for the DOIs listed in DOCID_FIXES.
    #
    # @param doc [Nokogiri::XML::Element]
    # @return [Array<Hash>] two hashes with :type and :id keys (NIST, DOI)
    #
    def parse_docid(doc)
      doi = doc.at("doi_data/doi").text
      id = doc.at("publisher_item/item_number", "publisher_item/identifier").text.sub(%r{^/}, "")
      pattern, replacement = DOCID_FIXES[doi]
      id = id.sub(pattern, replacement) if pattern
      [{ type: "NIST", id: id }, { type: "DOI", id: doi }]
    end

    # @param doc [Nokogiri::XML::Element]
    # @return [Array<RelatonBib::DocumentIdentifier>]
    def fetch_docid(doc)
      parse_docid(doc).map do |docid|
        RelatonBib::DocumentIdentifier.new(type: docid[:type], id: docid[:id])
      end
    end

    #
    # Build the title from all `title` and `subtitle` elements joined with
    # a space.
    #
    # @param doc [Nokogiri::XML::Element]
    # @return [RelatonBib::TypedTitleStringCollection, Array] empty array
    #   when the record has no title elements
    #
    def fetch_title(doc)
      titles = doc.xpath("titles/title|titles/subtitle")
      return [] unless titles.any?

      RelatonBib::TypedTitleString.from_string titles.map(&:text).join(" "), "en", "Latn"
    end

    #
    # Convert `publication_date` elements into "published" dates and
    # `approval_date` elements into "confirmed" dates. The day is only used
    # when a month is present.
    #
    # @param doc [Nokogiri::XML::Element]
    # @return [Array<RelatonBib::BibliographicDate>]
    #
    def fetch_date(doc)
      doc.xpath("publication_date|approval_date").map do |dt|
        parts = [dt.at("year").text]
        if (month = dt.at("month"))
          parts << month.text
          day = dt.at("day")
          parts << day.text if day
        end
        type = dt.name == "publication_date" ? "published" : "confirmed"
        RelatonBib::BibliographicDate.new(type: type, on: parts.join("-"))
      end
    end

    # @param doc [Nokogiri::XML::Element]
    # @return [String, nil] nil when the record has no `edition_number`
    def fetch_edition(doc)
      doc.at("edition_number")&.text
    end

    #
    # Convert the related items of the record into relations. The referenced
    # document is represented only by a formatted reference holding the text
    # of the relation element.
    #
    # @param doc [Nokogiri::XML::Element]
    # @return [Array<Hash>] hashes with :type and :bibitem keys
    #
    def fetch_relation(doc)
      ns = "http://www.crossref.org/relations.xsd"
      doc.xpath("./ns:program/ns:related_item", ns: ns).map do |item|
        rel = item.at_xpath("ns:intra_work_relation|ns:inter_work_relation", ns: ns)
        fref = RelatonBib::FormattedRef.new content: rel.text
        bibitem = RelatonBib::BibliographicItem.new formattedref: fref
        { type: RELATION_TYPES[rel["relationship-type"]], bibitem: bibitem }
      end
    end

    # @param doc [Nokogiri::XML::Element]
    # @return [Array<RelatonBib::TypedUri>] single link of type "doi"
    def fetch_link(doc)
      url = doc.at("doi_data/resource").text
      [RelatonBib::TypedUri.new(type: "doi", content: url)]
    end

    # @param doc [Nokogiri::XML::Element]
    # @return [Array<RelatonBib::FormattedString>] one item per abstract paragraph
    def fetch_abstract(doc)
      doc.xpath("jats:abstract/jats:p", "jats" => "http://www.ncbi.nlm.nih.gov/JATS1").map do |para|
        RelatonBib::FormattedString.new(content: para.text, language: doc["language"], script: "Latn")
      end
    end

    #
    # Collect persons (from `contributors/person_name`) and publishers
    # (from `publisher`) of the record.
    #
    # Each whitespace-separated part of the given name that is a single word
    # character, optionally followed by a dot, becomes an initial (without
    # the dot); every other part becomes a forename.
    #
    # @param doc [Nokogiri::XML::Element]
    # @return [Array<Hash>] hashes with :entity and :role keys
    #
    def fetch_contributor(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
      lang = doc["language"]
      persons = doc.xpath("contributors/person_name").map do |pers|
        forename, initial = given_name_parts(pers.at("given_name")&.text, lang)
        surname = RelatonBib::LocalizedString.new pers.at("surname").text, lang, "Latn"
        # The collected initials are passed on as they are; they used to be
        # reset to an empty array at this point and were lost.
        ident = pers.xpath("ORCID").map do |orcid|
          RelatonBib::PersonIdentifier.new "orcid", orcid.text
        end
        fullname = RelatonBib::FullName.new(
          surname: surname, forename: forename, initial: initial, identifier: ident,
        )
        person = RelatonBib::Person.new name: fullname
        { entity: person, role: [{ type: pers["contributor_role"] }] }
      end
      publishers = doc.xpath("publisher").map do |pub|
        abbr = pub.at("../institution/institution_acronym")&.text
        org = RelatonBib::Organization.new(name: pub.at("publisher_name").text, abbreviation: abbr)
        { entity: org, role: [{ type: "publisher" }] }
      end
      persons + publishers
    end

    # @param doc [Nokogiri::XML::Element]
    # @return [Array<String>]
    def fetch_place(doc)
      doc.xpath("institution/institution_place").map(&:text)
    end

    #
    # Save document
    #
    # The file name is the first document identifier with slashes, spaces,
    # colons and dots replaced by underscores, upcased. An existing file is
    # not overwritten; a warning is printed instead.
    #
    # @param bib [RelatonNist::NistBibliographicItem]
    #
    def write_file(bib) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
      docid = bib.docidentifier.first.id
      id = docid.gsub(%r{[/\s:.]}, "_").upcase.sub(/^NIST_IR/, "NISTIR")
      file = File.join(@output, "#{id}.#{@format}")
      if File.exist? file
        warn "File #{file} exists. Docid: #{docid}"
      else
        # For any format other than "yaml" or "xml" `content` is nil.
        content = case @format
                  when "yaml" then bib.to_hash.to_yaml
                  when "xml" then bib.to_xml bibdata: true
                  end
        File.write file, content, encoding: "UTF-8"
      end
    end

    #
    # Create a document instance and save it.
    #
    # @param doc [Nokogiri::XML::Element]
    #
    # @raise [StandardError] any error raised while parsing or saving is
    #   reported with `warn` together with the record's DOI and re-raised
    #
    def parse_doc(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
      item = RelatonNist::NistBibliographicItem.new(
        type: "standard", docid: fetch_docid(doc), title: fetch_title(doc),
        link: fetch_link(doc), abstract: fetch_abstract(doc),
        date: fetch_date(doc), edition: fetch_edition(doc),
        contributor: fetch_contributor(doc), relation: fetch_relation(doc),
        place: fetch_place(doc),
        language: [doc["language"]], script: ["Latn"], doctype: "standard"
      )
      write_file item
    rescue StandardError => e
      # Safe navigation keeps a record without a `doi` node from raising
      # here and hiding the error being reported.
      warn "Document: #{doc.at('doi')&.text}"
      warn e.message
      raise
    end

    #
    # Fetch all the documents from the dataset
    #
    # Creates the output folder when it is missing (including missing parent
    # folders), removes the files of the current format found in it, then
    # parses and saves every record. Errors are reported with `warn` and
    # are not re-raised.
    #
    def fetch # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
      t1 = Time.now
      puts "Started at: #{t1}"

      docs = Nokogiri::XML OpenURI.open_uri URL
      FileUtils.mkdir_p @output
      FileUtils.rm Dir[File.join(@output, "*.#{@format}")]
      docs.xpath("/body/query/doi_record/report-paper/report-paper_metadata")
        .each { |doc| parse_doc doc }

      t2 = Time.now
      puts "Stopped at: #{t2}"
      puts "Done in: #{(t2 - t1).round} sec."
    rescue StandardError => e
      warn e.message
    end

    #
    # Fetch all the documents from the dataset
    #
    # @param [String] output folder name to save the documents
    # @param [String] format format to save the documents
    #
    def self.fetch(output: "data", format: "yaml")
      new(output, format).fetch
    end

    private

    #
    # Split a given name into forenames and initials.
    #
    # @param given [String, nil] content of the `given_name` element
    # @param lang [String, nil] language assigned to the localized strings
    # @return [Array<Array<RelatonBib::LocalizedString>>] forenames, initials
    #
    def given_name_parts(given, lang)
      forename = []
      initial = []
      given&.split&.each do |part|
        if (m = /^(?<init>\w)\.?$/.match(part))
          initial << RelatonBib::LocalizedString.new(m[:init], lang, "Latn")
        else
          forename << RelatonBib::LocalizedString.new(part, lang, "Latn")
        end
      end
      [forename, initial]
    end
  end
end
@@ -15,17 +15,19 @@ module RelatonNist
15
15
  DATAFILE = File.expand_path "pubs-export.zip", DATAFILEDIR
16
16
  GHNISTDATA = "https://raw.githubusercontent.com/relaton/relaton-data-nist/main/data/"
17
17
 
18
- # @param ref_nbr [String]
19
- # @param year [String]
20
- # @param opts [Hash]
21
- # @option opts [String] :stage
22
- def initialize(ref_nbr, year = nil, opts = {}) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
23
- super ref_nbr, year
24
-
25
- # /(?<docid>(?:SP|FIPS)\s[0-9-]+)/ =~ text
18
+ def self.search(text, year = nil, opts = {})
19
+ new(text, year).search(opts)
20
+ end
21
+
22
+ def search(opts)
26
23
  @array = from_json(**opts)
27
24
  @array = from_ga unless @array.any?
25
+ sort_hits!
26
+ end
28
27
 
28
+ private
29
+
30
+ def sort_hits!
29
31
  @array.sort! do |a, b|
30
32
  if a.sort_value == b.sort_value
31
33
  (b.hit[:release_date] - a.hit[:release_date]).to_i
@@ -33,10 +35,9 @@ module RelatonNist
33
35
  b.sort_value - a.sort_value
34
36
  end
35
37
  end
38
+ self
36
39
  end
37
40
 
38
- private
39
-
40
41
  def from_ga # rubocop:disable Metrics/AbcSize
41
42
  fn = text.gsub(%r{[/\s:.]}, "_").upcase
42
43
  yaml = OpenURI.open_uri "#{GHNISTDATA}#{fn}.yaml"
@@ -14,7 +14,7 @@ module RelatonNist
14
14
  # @param text [String]
15
15
  # @return [RelatonNist::HitCollection]
16
16
  def search(text, year = nil, opts = {})
17
- HitCollection.new text, year, opts
17
+ HitCollection.search text, year, opts
18
18
  rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError => e
19
19
  raise RelatonBib::RequestError, e.message
20
20
  end
@@ -7,6 +7,7 @@ module RelatonNist
7
7
  @prefix = "NIST"
8
8
  @defaultprefix = %r{^(NIST|NISTGCR|ITL Bulletin|JPCRD|NISTIR|CSRC|FIPS)(/[^\s])?\s}
9
9
  @idtype = "NIST"
10
+ @datasets = %w[nist-tech-pubs]
10
11
  end
11
12
 
12
13
  # @param code [String]
@@ -17,6 +18,10 @@ module RelatonNist
17
18
  ::RelatonNist::NistBibliography.get(code, date, opts)
18
19
  end
19
20
 
21
+ def fetch_data(_source, opts)
22
+ DataFetcher.fetch(**opts)
23
+ end
24
+
20
25
  # @param xml [String]
21
26
  # @return [RelatonNist::GbBibliographicItem]
22
27
  def from_xml(xml)
@@ -1,3 +1,3 @@
1
1
  module RelatonNist
2
- VERSION = "1.9.1".freeze
2
+ VERSION = "1.9.2".freeze
3
3
  end
data/lib/relaton_nist.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "relaton_nist/version"
2
2
  require "relaton_nist/nist_bibliography"
3
+ require "relaton_nist/data_fetcher"
3
4
 
4
5
  # if defined? Relaton
5
6
  # require_relative "relaton/processor"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-nist
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.1
4
+ version: 1.9.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-08-30 00:00:00.000000000 Z
11
+ date: 2021-09-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: equivalent-xml
@@ -175,6 +175,7 @@ files:
175
175
  - grammars/reqt.rng
176
176
  - lib/relaton_nist.rb
177
177
  - lib/relaton_nist/comment_period.rb
178
+ - lib/relaton_nist/data_fetcher.rb
178
179
  - lib/relaton_nist/document_relation.rb
179
180
  - lib/relaton_nist/document_status.rb
180
181
  - lib/relaton_nist/hash_converter.rb