relaton-calconnect 1.9.0 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +19 -0
- data/lib/relaton_calconnect/data_fetcher.rb +93 -0
- data/lib/relaton_calconnect/processor.rb +14 -1
- data/lib/relaton_calconnect/scrapper.rb +65 -10
- data/lib/relaton_calconnect/version.rb +1 -1
- data/lib/relaton_calconnect.rb +1 -0
- data/relaton_calconnect.gemspec +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 276dc8eec3c63bb0992e5986afcf1c45264e4541c5f008697c51f758be583d94
|
4
|
+
data.tar.gz: 50512e178a70c19233a71a7ecfd081efa23128227e4ec9de73d4d270a8c1a1aa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3786fc11fc5a004f691d3f5994bd4289ab7f71116cfb0de5f5e3d2477a6460a7b3b4a7073b91bdd9b736f046bad68e824fa22cb983727589a89899d168419615
|
7
|
+
data.tar.gz: a0032955308db628a798ee1f9a45616e49e82799031ee205de6b73298a768fee8ef175835330567ef5709c8a2ba5d853ca0d10c865fdc793baf966c472bb158c
|
data/README.adoc
CHANGED
@@ -106,6 +106,25 @@ RelatonCalconnect::CcBibliographicItem.from_hash hash
|
|
106
106
|
...
|
107
107
|
----
|
108
108
|
|
109
|
+
=== Fetch data
|
110
|
+
|
111
|
+
This gem uses the https://standards.calconnect.org/relaton/index.yaml dataset as one of its data sources.
|
112
|
+
|
113
|
+
The method `RelatonCalconnect::DataFetcher.fetch(output: "data", format: "yaml")` fetches all the documents from the dataset and saves them to the `./data` folder in YAML format.
|
114
|
+
Arguments:
|
115
|
+
|
116
|
+
- `output` - folder to save documents (default './data').
|
117
|
+
- `format` - format in which the documents are saved. Possible formats are: `yaml`, `xml` (default `yaml`).
|
118
|
+
|
119
|
+
[source,ruby]
|
120
|
+
----
|
121
|
+
RelatonCalconnect::DataFetcher.fetch
|
122
|
+
Started at: 2021-09-09 16:03:51 +0200
|
123
|
+
Stopped at: 2021-09-09 16:04:12 +0200
|
124
|
+
Done in: 20 sec.
|
125
|
+
=> nil
|
126
|
+
----
|
127
|
+
|
109
128
|
== Development
|
110
129
|
|
111
130
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal:true
|
2
|
+
|
3
|
+
module RelatonCalconnect
|
4
|
+
#
|
5
|
+
# Relaton-calconnect data fetcher
|
6
|
+
#
|
7
|
+
class DataFetcher
|
8
|
+
# DOMAIN = "https://standards.calconnect.org/"
|
9
|
+
# SCHEME, HOST = DOMAIN.split(%r{:?/?/})
|
10
|
+
ENDPOINT = "https://standards.calconnect.org/relaton/index.yaml"
|
11
|
+
# DATADIR = "data"
|
12
|
+
# DATAFILE = File.join DATADIR, "bibliography.yml"
|
13
|
+
# ETAGFILE = File.join DATADIR, "etag.txt"
|
14
|
+
|
15
|
+
def initialize(output, format)
|
16
|
+
@output = output
|
17
|
+
@etagfile = File.join output, "etag.txt"
|
18
|
+
@format = format
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.fetch(output: "data", format: "yaml")
|
22
|
+
t1 = Time.now
|
23
|
+
puts "Started at: #{t1}"
|
24
|
+
FileUtils.mkdir_p output unless Dir.exist? output
|
25
|
+
new(output, format).fetch
|
26
|
+
t2 = Time.now
|
27
|
+
puts "Stopped at: #{t2}"
|
28
|
+
puts "Done in: #{(t2 - t1).round} sec."
|
29
|
+
end
|
30
|
+
|
31
|
+
#
|
32
|
+
# Fetch data from the server and save it to file.
|
33
|
+
#
|
34
|
+
def fetch
|
35
|
+
resp = Faraday.new(ENDPOINT, headers: { "If-None-Match" => etag }).get
|
36
|
+
# return if there aren't any changes since last fetching
|
37
|
+
return unless resp.status == 200
|
38
|
+
|
39
|
+
data = YAML.safe_load resp.body
|
40
|
+
all_success = true
|
41
|
+
data["root"]["items"].each do |doc|
|
42
|
+
success = parse_page doc
|
43
|
+
all_success &&= success
|
44
|
+
end
|
45
|
+
self.etag = resp[:etag] if all_success
|
46
|
+
end
|
47
|
+
|
48
|
+
private
|
49
|
+
|
50
|
+
#
|
51
|
+
# Parse document and write it to file
|
52
|
+
#
|
53
|
+
# @param [Hash] doc
|
54
|
+
#
|
55
|
+
def parse_page(doc)
|
56
|
+
bib = Scrapper.parse_page doc
|
57
|
+
# bib.link.each { |l| l.content.merge!(scheme: SCHEME, host: HOST) unless l.content.host }
|
58
|
+
write_doc doc["docid"]["id"], bib
|
59
|
+
true
|
60
|
+
rescue StandardError => e
|
61
|
+
warn "Document: #{doc['docid']['id']}"
|
62
|
+
warn e.message
|
63
|
+
puts e.backtrace
|
64
|
+
false
|
65
|
+
end
|
66
|
+
|
67
|
+
def write_doc(docid, bib)
|
68
|
+
content = @format == "xml" ? bib.to_xml(bibdata: true) : bib.to_hash.to_yaml
|
69
|
+
file = File.join @output, "#{docid.downcase.gsub(%r{[/\s:]}, '_')}.#{@format}"
|
70
|
+
# if File.exist? file
|
71
|
+
# warn "#{file} exist"
|
72
|
+
# else
|
73
|
+
File.write file, content, encoding: "UTF-8"
|
74
|
+
# end
|
75
|
+
end
|
76
|
+
|
77
|
+
#
|
78
|
+
# Read ETag from file
|
79
|
+
#
|
80
|
+
# @return [String, NilClass]
|
81
|
+
def etag
|
82
|
+
@etag ||= File.exist?(@etagfile) ? File.read(@etagfile, encoding: "UTF-8") : nil
|
83
|
+
end
|
84
|
+
|
85
|
+
#
|
86
|
+
# Save ETag to file
|
87
|
+
#
|
88
|
+
# @param e_tag [String]
|
89
|
+
def etag=(e_tag)
|
90
|
+
File.write @etagfile, e_tag, encoding: "UTF-8"
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
@@ -4,11 +4,12 @@ module RelatonCalconnect
|
|
4
4
|
class Processor < Relaton::Processor
|
5
5
|
attr_reader :idtype
|
6
6
|
|
7
|
-
def initialize
|
7
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
8
8
|
@short = :relaton_calconnect
|
9
9
|
@prefix = "CC"
|
10
10
|
@defaultprefix = %r{^CC\s}
|
11
11
|
@idtype = "CC"
|
12
|
+
@datasets = %w[calconnect-org]
|
12
13
|
end
|
13
14
|
|
14
15
|
# @param code [String]
|
@@ -19,6 +20,18 @@ module RelatonCalconnect
|
|
19
20
|
::RelatonCalconnect::CcBibliography.get(code, date, opts)
|
20
21
|
end
|
21
22
|
|
23
|
+
#
|
24
|
+
# Fetch all the documents from a source
|
25
|
+
#
|
26
|
+
# @param [String] _source source name
|
27
|
+
# @param [Hash] opts
|
28
|
+
# @option opts [String] :output directory to output documents
|
29
|
+
# @option opts [String] :format
|
30
|
+
#
|
31
|
+
def fetch_data(_source, opts)
|
32
|
+
DataFetcher.fetch(**opts)
|
33
|
+
end
|
34
|
+
|
22
35
|
# @param xml [String]
|
23
36
|
# @return [RelatonCalconnect::CcBibliographicItem]
|
24
37
|
def from_xml(xml)
|
@@ -1,17 +1,24 @@
|
|
1
1
|
module RelatonCalconnect
|
2
2
|
module Scrapper
|
3
3
|
DOMAIN = "https://standards.calconnect.org/".freeze
|
4
|
+
SCHEME, HOST = DOMAIN.split(%r{:?/?/})
|
4
5
|
# DOMAIN = "http://127.0.0.1:4000/".freeze
|
5
6
|
|
6
7
|
class << self
|
7
8
|
# @param hit [Hash]
|
8
9
|
# @return [RelatonCalconnect::CcBibliographicItem]
|
9
10
|
def parse_page(hit)
|
10
|
-
|
11
|
+
links = array(hit["link"])
|
12
|
+
link = links.detect { |l| l["type"] == "rxl" }
|
11
13
|
if link
|
12
|
-
|
13
|
-
|
14
|
+
bib = fetch_bib_xml link["content"]
|
15
|
+
update_links bib, links
|
16
|
+
# XMLParser.from_xml bib_xml
|
17
|
+
else
|
18
|
+
bib = RelatonCalconnect::CcBibliographicItem.from_hash doc_to_hash(hit)
|
14
19
|
end
|
20
|
+
bib.link.each { |l| l.content.merge!(scheme: SCHEME, host: HOST) unless l.content.host }
|
21
|
+
bib
|
15
22
|
end
|
16
23
|
|
17
24
|
private
|
@@ -19,15 +26,25 @@ module RelatonCalconnect
|
|
19
26
|
# @param url [String]
|
20
27
|
# @return [String] XML
|
21
28
|
def fetch_bib_xml(url)
|
29
|
+
# rxl = get_rxl url
|
30
|
+
# uri_rxl = rxl.at("uri[@type='rxl']")
|
31
|
+
# return rxl.to_xml unless uri_rxl
|
32
|
+
|
33
|
+
# uri_xml = rxl.xpath("//uri").to_xml
|
34
|
+
# rxl = get_rxl uri_rxl.text
|
35
|
+
# docid = rxl.at "//docidentifier"
|
36
|
+
# docid.add_previous_sibling uri_xml
|
37
|
+
# rxl.to_xml
|
22
38
|
rxl = get_rxl url
|
23
39
|
uri_rxl = rxl.at("uri[@type='rxl']")
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
rxl.to_xml
|
40
|
+
if uri_rxl
|
41
|
+
uri_xml = rxl.xpath("//uri").to_xml
|
42
|
+
rxl = get_rxl uri_rxl.text
|
43
|
+
docid = rxl.at "//docidentifier"
|
44
|
+
docid.add_previous_sibling uri_xml
|
45
|
+
end
|
46
|
+
# Use non-bang gsub: gsub! returns nil when no technical-committee tag is present.
xml = rxl.to_xml.gsub(%r{(</?)technical-committee(>)}, '\1committee\2')
|
47
|
+
RelatonCalconnect::XMLParser.from_xml xml
|
31
48
|
end
|
32
49
|
|
33
50
|
# @param path [String]
|
@@ -36,6 +53,44 @@ module RelatonCalconnect
|
|
36
53
|
resp = Faraday.get DOMAIN + path
|
37
54
|
Nokogiri::XML resp.body
|
38
55
|
end
|
56
|
+
|
57
|
+
#
|
58
|
+
# Fix editorial group
|
59
|
+
#
|
60
|
+
# @param [Hash] doc
|
61
|
+
#
|
62
|
+
# @return [Hash]
|
63
|
+
#
|
64
|
+
def doc_to_hash(doc)
|
65
|
+
array(doc["editorialgroup"]).each do |eg|
|
66
|
+
tc = eg.delete("technical_committee")
|
67
|
+
eg.merge!(tc) if tc
|
68
|
+
end
|
69
|
+
doc
|
70
|
+
end
|
71
|
+
|
72
|
+
def update_links(bib, links)
|
73
|
+
links.each do |l|
|
74
|
+
tu = l.transform_keys(&:to_sym)
|
75
|
+
bib.link << RelatonBib::TypedUri.new(**tu) unless bib.url(l["type"])
|
76
|
+
end
|
77
|
+
bib
|
78
|
+
end
|
79
|
+
|
80
|
+
#
|
81
|
+
# Wrap into Array if not Array
|
82
|
+
#
|
83
|
+
# @param [Array, Hash, String, nil] content
|
84
|
+
#
|
85
|
+
# @return [Array<Hash, String>]
|
86
|
+
#
|
87
|
+
def array(content)
|
88
|
+
case content
|
89
|
+
when Array then content
|
90
|
+
when nil then []
|
91
|
+
else [content]
|
92
|
+
end
|
93
|
+
end
|
39
94
|
end
|
40
95
|
end
|
41
96
|
end
|
data/lib/relaton_calconnect.rb
CHANGED
@@ -8,6 +8,7 @@ require "relaton_calconnect/technical_committee"
|
|
8
8
|
require "relaton_calconnect/cc_bibliographic_item"
|
9
9
|
require "relaton_calconnect/xml_parser"
|
10
10
|
require "relaton_calconnect/hash_converter"
|
11
|
+
require "relaton_calconnect/data_fetcher"
|
11
12
|
|
12
13
|
module RelatonCalconnect
|
13
14
|
class Error < StandardError; end
|
data/relaton_calconnect.gemspec
CHANGED
@@ -27,7 +27,7 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
|
28
28
|
|
29
29
|
spec.add_development_dependency "equivalent-xml", "~> 0.6"
|
30
|
-
spec.add_development_dependency "rake", "~>
|
30
|
+
spec.add_development_dependency "rake", "~> 13.0"
|
31
31
|
spec.add_development_dependency "rspec", "~> 3.0"
|
32
32
|
spec.add_development_dependency "ruby-jing"
|
33
33
|
spec.add_development_dependency "simplecov"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-calconnect
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.9.
|
4
|
+
version: 1.9.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-09-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: equivalent-xml
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '13.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '13.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -163,6 +163,7 @@ files:
|
|
163
163
|
- lib/relaton_calconnect.rb
|
164
164
|
- lib/relaton_calconnect/cc_bibliographic_item.rb
|
165
165
|
- lib/relaton_calconnect/cc_bibliography.rb
|
166
|
+
- lib/relaton_calconnect/data_fetcher.rb
|
166
167
|
- lib/relaton_calconnect/hash_converter.rb
|
167
168
|
- lib/relaton_calconnect/hit.rb
|
168
169
|
- lib/relaton_calconnect/hit_collection.rb
|