relaton-calconnect 1.9.0 → 1.9.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.adoc +19 -0
- data/lib/relaton_calconnect/data_fetcher.rb +93 -0
- data/lib/relaton_calconnect/processor.rb +14 -1
- data/lib/relaton_calconnect/scrapper.rb +65 -10
- data/lib/relaton_calconnect/version.rb +1 -1
- data/lib/relaton_calconnect.rb +1 -0
- data/relaton_calconnect.gemspec +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 276dc8eec3c63bb0992e5986afcf1c45264e4541c5f008697c51f758be583d94
|
4
|
+
data.tar.gz: 50512e178a70c19233a71a7ecfd081efa23128227e4ec9de73d4d270a8c1a1aa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3786fc11fc5a004f691d3f5994bd4289ab7f71116cfb0de5f5e3d2477a6460a7b3b4a7073b91bdd9b736f046bad68e824fa22cb983727589a89899d168419615
|
7
|
+
data.tar.gz: a0032955308db628a798ee1f9a45616e49e82799031ee205de6b73298a768fee8ef175835330567ef5709c8a2ba5d853ca0d10c865fdc793baf966c472bb158c
|
data/README.adoc
CHANGED
@@ -106,6 +106,25 @@ RelatonCalconnect::CcBibliographicItem.from_hash hash
|
|
106
106
|
...
|
107
107
|
----
|
108
108
|
|
109
|
+
=== Fetch data
|
110
|
+
|
111
|
+
This gem uses the https://standards.calconnect.org/relaton/index.yaml dataset as one of its data sources.
|
112
|
+
|
113
|
+
The method `RelatonCalconnect::DataFetcher.fetch(output: "data", format: "yaml")` fetches all the documents from the dataset and saves them to the `./data` folder in YAML format.
|
114
|
+
Arguments:
|
115
|
+
|
116
|
+
- `output` - folder to save documents (default './data').
|
117
|
+
- `format` - format in which the documents are saved. Possible formats are: `yaml`, `xml` (default `yaml`).
|
118
|
+
|
119
|
+
[source,ruby]
|
120
|
+
----
|
121
|
+
RelatonCalconnect::DataFetcher.fetch
|
122
|
+
Started at: 2021-09-09 16:03:51 +0200
|
123
|
+
Stopped at: 2021-09-09 16:04:12 +0200
|
124
|
+
Done in: 20 sec.
|
125
|
+
=> nil
|
126
|
+
----
|
127
|
+
|
109
128
|
== Development
|
110
129
|
|
111
130
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal:true
|
2
|
+
|
3
|
+
module RelatonCalconnect
  #
  # Relaton-calconnect data fetcher.
  #
  # Downloads the document index from ENDPOINT, converts every listed document
  # with Scrapper.parse_page and writes each one to its own file in the output
  # directory. The ETag of the index is kept in "etag.txt" inside the output
  # directory and sent back as "If-None-Match" on the next run.
  #
  class DataFetcher
    ENDPOINT = "https://standards.calconnect.org/relaton/index.yaml"

    #
    # @param output [String] directory to save the documents to
    # @param format [String] "xml" serializes documents with `to_xml`, any
    #   other value serializes them as YAML; also used as the file extension
    #
    def initialize(output, format)
      @output = output
      @etagfile = File.join output, "etag.txt"
      @format = format
    end

    #
    # Create the output directory if needed, fetch all the documents and
    # print start time, stop time and duration to stdout.
    #
    # @param output [String] directory to save the documents to
    # @param format [String] format of the saved documents ("yaml" or "xml")
    #
    def self.fetch(output: "data", format: "yaml")
      t1 = Time.now
      puts "Started at: #{t1}"
      FileUtils.mkdir_p output unless Dir.exist? output
      new(output, format).fetch
      t2 = Time.now
      puts "Stopped at: #{t2}"
      puts "Done in: #{(t2 - t1).round} sec."
    end

    #
    # Fetch data from the server and save it to files.
    #
    # Nothing is written when the response status is not 200 (e.g. the index
    # has not changed since the last fetch). The ETag is stored only when
    # every document was processed successfully, so a failed run is retried
    # in full next time.
    #
    def fetch
      resp = Faraday.new(ENDPOINT, headers: request_headers).get
      # return if there aren't any changes since last fetching
      return unless resp.status == 200

      data = YAML.safe_load resp.body
      # map (not all?) so that every document is processed even after a failure
      results = data["root"]["items"].map { |doc| parse_page doc }
      self.etag = resp[:etag] if results.all?
    end

    private

    #
    # Headers for the index request. "If-None-Match" is sent only when an
    # ETag from a previous run is available.
    #
    # @return [Hash]
    #
    def request_headers
      tag = etag
      tag.nil? || tag.empty? ? {} : { "If-None-Match" => tag }
    end

    #
    # Parse document and write it to file
    #
    # @param [Hash] doc
    #
    # @return [Boolean] true on success, false when parsing or writing failed
    #
    def parse_page(doc)
      bib = Scrapper.parse_page doc
      write_doc doc["docid"]["id"], bib
      true
    rescue StandardError => e
      # The identifier is looked up defensively: a malformed entry must not
      # raise again from inside the error handler.
      warn "Document: #{docid_of(doc)}"
      warn e.message
      warn e.backtrace
      false
    end

    #
    # Document identifier of an index entry, or nil when the entry has none.
    #
    # @param [Object] doc
    #
    # @return [String, nil]
    #
    def docid_of(doc)
      docid = doc["docid"] if doc.is_a?(Hash)
      docid["id"] if docid.is_a?(Hash)
    end

    #
    # Serialize the document and write it to the output directory. The file
    # name is the downcased identifier with "/", ":" and whitespace replaced
    # by "_", followed by the format as extension.
    #
    # @param docid [String] document identifier
    # @param bib [#to_xml, #to_hash] bibliographic item
    #
    def write_doc(docid, bib)
      content = @format == "xml" ? bib.to_xml(bibdata: true) : bib.to_hash.to_yaml
      file = File.join @output, "#{docid.downcase.gsub(%r{[/\s:]}, '_')}.#{@format}"
      File.write file, content, encoding: "UTF-8"
    end

    #
    # Read ETag from file
    #
    # @return [String, NilClass]
    #
    def etag
      @etag ||= File.exist?(@etagfile) ? File.read(@etagfile, encoding: "UTF-8") : nil
    end

    #
    # Save ETag to file. Nothing is written when the server sent no ETag.
    #
    # @param e_tag [String, nil]
    #
    def etag=(e_tag)
      return if e_tag.nil? || e_tag.empty?

      File.write @etagfile, e_tag, encoding: "UTF-8"
      @etag = e_tag
    end
  end
end
|
@@ -4,11 +4,12 @@ module RelatonCalconnect
|
|
4
4
|
class Processor < Relaton::Processor
|
5
5
|
attr_reader :idtype
|
6
6
|
|
7
|
-
def initialize
|
7
|
+
# Set the attributes that describe this processor: its short name, the
# document identifier prefix and type, the pattern matching the default
# prefix, and the names of the datasets it provides.
# The superclass initializer is intentionally not called.
def initialize # rubocop:disable Lint/MissingSuper
  @datasets = ["calconnect-org"]
  @idtype = "CC"
  @defaultprefix = /^CC\s/
  @prefix = "CC"
  @short = :relaton_calconnect
end
|
13
14
|
|
14
15
|
# @param code [String]
|
@@ -19,6 +20,18 @@ module RelatonCalconnect
|
|
19
20
|
::RelatonCalconnect::CcBibliography.get(code, date, opts)
|
20
21
|
end
|
21
22
|
|
23
|
+
#
# Fetch all the documents from a source by delegating to DataFetcher.fetch.
#
# @param [String] _source source name (not used)
# @param [Hash] opts options forwarded as keyword arguments
# @option opts [String] :output directory to output documents
# @option opts [String] :format
#
def fetch_data(_source, opts)
  ::RelatonCalconnect::DataFetcher.fetch(**opts)
end
|
34
|
+
|
22
35
|
# @param xml [String]
|
23
36
|
# @return [RelatonCalconnect::CcBibliographicItem]
|
24
37
|
def from_xml(xml)
|
@@ -1,17 +1,24 @@
|
|
1
1
|
module RelatonCalconnect
|
2
2
|
module Scrapper
|
3
3
|
DOMAIN = "https://standards.calconnect.org/".freeze
|
4
|
+
SCHEME, HOST = DOMAIN.split(%r{:?/?/})
|
4
5
|
# DOMAIN = "http://127.0.0.1:4000/".freeze
|
5
6
|
|
6
7
|
class << self
|
7
8
|
# Build a bibliographic item from one entry of the document index.
#
# When the entry has a link of type "rxl", the item is built from the
# downloaded RXL document and the entry's links are added to it; otherwise
# the item is built from the entry hash itself. Links without a host get
# SCHEME and HOST merged in.
#
# @param hit [Hash]
# @return [RelatonCalconnect::CcBibliographicItem]
def parse_page(hit)
  uris = array(hit["link"])
  rxl = uris.find { |uri| uri["type"] == "rxl" }
  item =
    if rxl
      fetched = fetch_bib_xml rxl["content"]
      update_links fetched, uris
      fetched
    else
      RelatonCalconnect::CcBibliographicItem.from_hash doc_to_hash(hit)
    end
  item.link.each do |typed_uri|
    next if typed_uri.content.host

    typed_uri.content.merge!(scheme: SCHEME, host: HOST)
  end
  item
end
|
16
23
|
|
17
24
|
private
|
@@ -19,15 +26,25 @@ module RelatonCalconnect
|
|
19
26
|
# Download the RXL document at +url+ and parse it into a bibliographic item.
#
# When the downloaded document contains a <uri type="rxl"> element, the
# document that element points to is downloaded instead, and all <uri>
# elements of the first document are inserted before its <docidentifier>.
# <technical-committee> tags are renamed to <committee> before parsing.
#
# @param url [String] location of the RXL document, passed to get_rxl
# @return [Object] result of RelatonCalconnect::XMLParser.from_xml
#   (presumably a RelatonCalconnect::CcBibliographicItem - confirm)
def fetch_bib_xml(url)
  rxl = get_rxl url
  uri_rxl = rxl.at("uri[@type='rxl']")
  if uri_rxl
    uri_xml = rxl.xpath("//uri").to_xml
    rxl = get_rxl uri_rxl.text
    docid = rxl.at "//docidentifier"
    docid.add_previous_sibling uri_xml
  end
  # gsub, not gsub!: the bang form returns nil when nothing was replaced,
  # which would hand nil to the parser for documents without a committee.
  xml = rxl.to_xml.gsub(%r{(</?)technical-committee(>)}, '\1committee\2')
  RelatonCalconnect::XMLParser.from_xml xml
end
|
32
49
|
|
33
50
|
# @param path [String]
|
@@ -36,6 +53,44 @@ module RelatonCalconnect
|
|
36
53
|
resp = Faraday.get DOMAIN + path
|
37
54
|
Nokogiri::XML resp.body
|
38
55
|
end
|
56
|
+
|
57
|
+
#
# Fix editorial group: move the content of each group's
# "technical_committee" entry up into the group itself.
# The hash is modified in place.
#
# @param [Hash] doc
#
# @return [Hash] the same, updated hash
#
def doc_to_hash(doc)
  groups = array(doc["editorialgroup"])
  groups.each do |group|
    committee = group.delete("technical_committee")
    next unless committee

    group.update(committee)
  end
  doc
end
|
71
|
+
|
72
|
+
# Add to the item every link whose type the item does not have a URL for yet.
# Links are processed in order, so only the first link of a given type is added.
#
# @param bib [#link, #url] bibliographic item, modified in place
# @param links [Array<Hash>] link attributes with String keys
# @return [Object] the same item
def update_links(bib, links)
  links.each_with_object(bib) do |attrs, item|
    typed_uri_args = attrs.transform_keys(&:to_sym)
    next if item.url(attrs["type"])

    item.link << RelatonBib::TypedUri.new(**typed_uri_args)
  end
end
|
79
|
+
|
80
|
+
#
# Normalize a value to an Array: an Array is returned as is, nil becomes
# an empty Array, anything else is wrapped into a one-element Array.
#
# @param [Array, Hash, String, nil] content
#
# @return [Array<Hash, String>]
#
def array(content)
  return content if content.is_a?(Array)
  return [] if content.nil?

  [content]
end
|
39
94
|
end
|
40
95
|
end
|
41
96
|
end
|
data/lib/relaton_calconnect.rb
CHANGED
@@ -8,6 +8,7 @@ require "relaton_calconnect/technical_committee"
|
|
8
8
|
require "relaton_calconnect/cc_bibliographic_item"
|
9
9
|
require "relaton_calconnect/xml_parser"
|
10
10
|
require "relaton_calconnect/hash_converter"
|
11
|
+
require "relaton_calconnect/data_fetcher"
|
11
12
|
|
12
13
|
module RelatonCalconnect
|
13
14
|
class Error < StandardError; end
|
data/relaton_calconnect.gemspec
CHANGED
@@ -27,7 +27,7 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
|
28
28
|
|
29
29
|
spec.add_development_dependency "equivalent-xml", "~> 0.6"
|
30
|
-
spec.add_development_dependency "rake", "~>
|
30
|
+
spec.add_development_dependency "rake", "~> 13.0"
|
31
31
|
spec.add_development_dependency "rspec", "~> 3.0"
|
32
32
|
spec.add_development_dependency "ruby-jing"
|
33
33
|
spec.add_development_dependency "simplecov"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-calconnect
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.9.
|
4
|
+
version: 1.9.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-09-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: equivalent-xml
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '13.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '13.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -163,6 +163,7 @@ files:
|
|
163
163
|
- lib/relaton_calconnect.rb
|
164
164
|
- lib/relaton_calconnect/cc_bibliographic_item.rb
|
165
165
|
- lib/relaton_calconnect/cc_bibliography.rb
|
166
|
+
- lib/relaton_calconnect/data_fetcher.rb
|
166
167
|
- lib/relaton_calconnect/hash_converter.rb
|
167
168
|
- lib/relaton_calconnect/hit.rb
|
168
169
|
- lib/relaton_calconnect/hit_collection.rb
|