relaton-calconnect 1.9.0 → 1.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1c3a78f7962d9b70c4dc6a30bd002f9973998f55d52af6612f63beb58e5109bf
4
- data.tar.gz: e0f8ac1a2c0c676dca4e1f3ebf0ae044a453a143824f953e860017777b01a587
3
+ metadata.gz: 276dc8eec3c63bb0992e5986afcf1c45264e4541c5f008697c51f758be583d94
4
+ data.tar.gz: 50512e178a70c19233a71a7ecfd081efa23128227e4ec9de73d4d270a8c1a1aa
5
5
  SHA512:
6
- metadata.gz: 66e83c6550539dcacb9583e9f90d67666b70ac9e39b8802a737741600b88ee3c1ca11c7cd9275acccc9ef023e9602a3495f573f75d76845070864f4335e64e8b
7
- data.tar.gz: c7ace3d658e7c003847a1cc46b11385cf868d9fc47812e57c46d8ce08a87f865aa61a63b3e5724cc4a9c4554ca90d014cc7cdd7627ed9d09444c07aeba168c19
6
+ metadata.gz: 3786fc11fc5a004f691d3f5994bd4289ab7f71116cfb0de5f5e3d2477a6460a7b3b4a7073b91bdd9b736f046bad68e824fa22cb983727589a89899d168419615
7
+ data.tar.gz: a0032955308db628a798ee1f9a45616e49e82799031ee205de6b73298a768fee8ef175835330567ef5709c8a2ba5d853ca0d10c865fdc793baf966c472bb158c
data/README.adoc CHANGED
@@ -106,6 +106,25 @@ RelatonCalconnect::CcBibliographicItem.from_hash hash
106
106
  ...
107
107
  ----
108
108
 
109
+ === Fetch data
110
+
111
+ This gem uses the https://standards.calconnect.org/relaton/index.yaml dataset as one of its data sources.
112
+
113
+ The method `RelatonCalconnect::DataFetcher.fetch(output: "data", format: "yaml")` fetches all the documents from the dataset and saves them to the `./data` folder in YAML format.
114
+ Arguments:
115
+
116
+ - `output` - folder to save documents (default './data').
117
+ - `format` - format in which the documents are saved. Possible formats are: `yaml`, `xml` (default `yaml`).
118
+
119
+ [source,ruby]
120
+ ----
121
+ RelatonCalconnect::DataFetcher.fetch
122
+ Started at: 2021-09-09 16:03:51 +0200
123
+ Stopped at: 2021-09-09 16:04:12 +0200
124
+ Done in: 20 sec.
125
+ => nil
126
+ ----
127
+
109
128
  == Development
110
129
 
111
130
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -0,0 +1,93 @@
1
# frozen_string_literal: true

require "fileutils"
require "yaml"

module RelatonCalconnect
  #
  # Relaton-calconnect data fetcher.
  #
  # Downloads the document index from ENDPOINT, converts every listed item
  # with Scrapper.parse_page and writes one file per document into the
  # output directory. The ETag of the index response is kept in `etag.txt`
  # inside that directory and sent as `If-None-Match` on the next run.
  #
  class DataFetcher
    ENDPOINT = "https://standards.calconnect.org/relaton/index.yaml"

    #
    # @param output [String] directory for the documents and the ETag file
    # @param format [String] "xml" writes XML; any other value writes YAML.
    #   The value is also used as the file extension.
    #
    def initialize(output, format)
      @output = output
      @etagfile = File.join output, "etag.txt"
      @format = format
    end

    #
    # Create the output directory, run a fetch and print timing information.
    #
    # @param output [String] directory to save documents
    # @param format [String] output format
    #
    # @return [nil]
    #
    def self.fetch(output: "data", format: "yaml")
      t1 = Time.now
      puts "Started at: #{t1}"
      # mkdir_p is a no-op for an existing directory, so no guard is needed
      FileUtils.mkdir_p output
      new(output, format).fetch
      t2 = Time.now
      puts "Stopped at: #{t2}"
      puts "Done in: #{(t2 - t1).round} sec."
    end

    #
    # Fetch data from the server and save it to files.
    #
    # The stored ETag is updated only when every document was processed
    # successfully, so a partially failed run is repeated next time.
    #
    def fetch
      resp = Faraday.new(ENDPOINT, headers: { "If-None-Match" => etag }).get
      # return if there aren't any changes since last fetching
      return unless resp.status == 200

      data = YAML.safe_load resp.body
      # `map` processes every item even after a failure; the results are
      # only inspected afterwards
      results = data["root"]["items"].map { |doc| parse_page doc }
      self.etag = resp[:etag] if results.all?
    end

    private

    #
    # Parse document and write it to file
    #
    # @param [Hash] doc item of the index
    #
    # @return [Boolean] true on success, false when any error was raised
    #
    def parse_page(doc)
      bib = Scrapper.parse_page doc
      write_doc doc["docid"]["id"], bib
      true
    rescue StandardError => e
      # All diagnostics go to stderr. The identifier lookup must not raise
      # here, otherwise one malformed item would abort the whole run.
      warn "Document: #{log_id(doc)}"
      warn e.message
      warn e.backtrace
      false
    end

    #
    # Best-effort document identifier for log messages; never raises.
    #
    # @param [Object] doc item of the index
    #
    # @return [Object, nil]
    #
    def log_id(doc)
      docid = doc.is_a?(Hash) ? doc["docid"] : nil
      docid.is_a?(Hash) ? docid["id"] : docid
    end

    #
    # Serialize the document and write it to the output directory.
    # The file name is the lower-cased identifier with "/", ":" and
    # whitespace replaced by "_". An existing file is overwritten.
    #
    # @param [String] docid document identifier
    # @param [#to_xml, #to_hash] bib bibliographic item
    #
    def write_doc(docid, bib)
      content = @format == "xml" ? bib.to_xml(bibdata: true) : bib.to_hash.to_yaml
      file = File.join @output, "#{docid.downcase.gsub(%r{[/\s:]}, '_')}.#{@format}"
      File.write file, content, encoding: "UTF-8"
    end

    #
    # Read ETag from file
    #
    # @return [String, NilClass]
    def etag
      @etag ||= File.exist?(@etagfile) ? File.read(@etagfile, encoding: "UTF-8") : nil
    end

    #
    # Save ETag to file
    #
    # @param e_tag [String]
    def etag=(e_tag)
      File.write @etagfile, e_tag, encoding: "UTF-8"
    end
  end
end
@@ -4,11 +4,12 @@ module RelatonCalconnect
4
4
  class Processor < Relaton::Processor
5
5
  attr_reader :idtype
6
6
 
7
- def initialize
7
# Set the processor's registry name, document prefix, prefix pattern,
# identifier type and the list of datasets it offers.
def initialize # rubocop:disable Lint/MissingSuper
  @datasets = %w[calconnect-org]
  @idtype = "CC"
  @prefix = "CC"
  @defaultprefix = /^CC\s/
  @short = :relaton_calconnect
end
13
14
 
14
15
  # @param code [String]
@@ -19,6 +20,18 @@ module RelatonCalconnect
19
20
  ::RelatonCalconnect::CcBibliography.get(code, date, opts)
20
21
  end
21
22
 
23
#
# Fetch all the documents from a source.
#
# The options Hash is passed to DataFetcher.fetch as keyword arguments;
# the source name is not used.
#
# @param [String] _source source name
# @param [Hash] opts
# @option opts [String] :output directory to output documents
# @option opts [String] :format
#
def fetch_data(_source, opts)
  ::RelatonCalconnect::DataFetcher.fetch(**opts)
end
34
+
22
35
  # @param xml [String]
23
36
  # @return [RelatonCalconnect::CcBibliographicItem]
24
37
  def from_xml(xml)
@@ -1,17 +1,24 @@
1
1
  module RelatonCalconnect
2
2
  module Scrapper
3
3
  DOMAIN = "https://standards.calconnect.org/".freeze
4
+ SCHEME, HOST = DOMAIN.split(%r{:?/?/})
4
5
  # DOMAIN = "http://127.0.0.1:4000/".freeze
5
6
 
6
7
  class << self
7
8
  # papam hit [Hash]
8
9
  # @return [RelatonOgc::OrcBibliographicItem]
9
10
  def parse_page(hit)
10
- link = hit["link"].detect { |l| l["type"] == "rxl" }
11
+ links = array(hit["link"])
12
+ link = links.detect { |l| l["type"] == "rxl" }
11
13
  if link
12
- bib_xml = fetch_bib_xml link["content"]
13
- XMLParser.from_xml bib_xml
14
+ bib = fetch_bib_xml link["content"]
15
+ update_links bib, links
16
+ # XMLParser.from_xml bib_xml
17
+ else
18
+ bib = RelatonCalconnect::CcBibliographicItem.from_hash doc_to_hash(hit)
14
19
  end
20
+ bib.link.each { |l| l.content.merge!(scheme: SCHEME, host: HOST) unless l.content.host }
21
+ bib
15
22
  end
16
23
 
17
24
  private
@@ -19,15 +26,25 @@ module RelatonCalconnect
19
26
  # @param url [String]
20
27
  # @return [String] XML
21
28
  def fetch_bib_xml(url)
29
+ # rxl = get_rxl url
30
+ # uri_rxl = rxl.at("uri[@type='rxl']")
31
+ # return rxl.to_xml unless uri_rxl
32
+
33
+ # uri_xml = rxl.xpath("//uri").to_xml
34
+ # rxl = get_rxl uri_rxl.text
35
+ # docid = rxl.at "//docidentifier"
36
+ # docid.add_previous_sibling uri_xml
37
+ # rxl.to_xml
22
38
  rxl = get_rxl url
23
39
  uri_rxl = rxl.at("uri[@type='rxl']")
24
- return rxl.to_xml unless uri_rxl
25
-
26
- uri_xml = rxl.xpath("//uri").to_xml
27
- rxl = get_rxl uri_rxl.text
28
- docid = rxl.at "//docidentifier"
29
- docid.add_previous_sibling uri_xml
30
- rxl.to_xml
40
+ if uri_rxl
41
+ uri_xml = rxl.xpath("//uri").to_xml
42
+ rxl = get_rxl uri_rxl.text
43
+ docid = rxl.at "//docidentifier"
44
+ docid.add_previous_sibling uri_xml
45
+ end
46
+ xml = rxl.to_xml.gsub!(%r{(</?)technical-committee(>)}, '\1committee\2')
47
+ RelatonCalconnect::XMLParser.from_xml xml
31
48
  end
32
49
 
33
50
  # @param path [String]
@@ -36,6 +53,44 @@ module RelatonCalconnect
36
53
  resp = Faraday.get DOMAIN + path
37
54
  Nokogiri::XML resp.body
38
55
  end
56
+
57
#
# Fix editorial group.
#
# For every editorial group of the document, the "technical_committee"
# entry is removed and its keys are merged into the group itself.
# The given Hash is modified in place.
#
# @param [Hash] doc
#
# @return [Hash] the same +doc+ object
#
def doc_to_hash(doc)
  array(doc["editorialgroup"]).each do |group|
    committee = group.delete("technical_committee")
    next unless committee

    group.update(committee)
  end
  doc
end
71
+
72
# Add to the item every given link whose type the item has no URL for yet.
# Links are handled in order, so of several links with the same type only
# the first one is added.
#
# @param bib [#link, #url] bibliographic item to extend
# @param links [Array<Hash>] links with String keys
#
# @return [Object] the same +bib+ object
def update_links(bib, links)
  links.each_with_object(bib) do |link, item|
    attrs = link.transform_keys(&:to_sym)
    next if item.url(link["type"])

    item.link << RelatonBib::TypedUri.new(**attrs)
  end
end
79
+
80
#
# Wrap into Array if not Array.
#
# Unlike Kernel#Array, a Hash is wrapped as a single element rather than
# converted into key/value pairs.
#
# @param [Array, Hash, String, nil] content
#
# @return [Array<Hash, String>] +content+ itself when it is an Array,
#   an empty Array for nil, otherwise a one-element Array
#
def array(content)
  return [] if content.nil?

  content.is_a?(Array) ? content : [content]
end
39
94
  end
40
95
  end
41
96
  end
@@ -1,3 +1,3 @@
1
1
  module RelatonCalconnect
2
- VERSION = "1.9.0".freeze
2
+ VERSION = "1.9.1".freeze
3
3
  end
@@ -8,6 +8,7 @@ require "relaton_calconnect/technical_committee"
8
8
  require "relaton_calconnect/cc_bibliographic_item"
9
9
  require "relaton_calconnect/xml_parser"
10
10
  require "relaton_calconnect/hash_converter"
11
+ require "relaton_calconnect/data_fetcher"
11
12
 
12
13
  module RelatonCalconnect
13
14
  class Error < StandardError; end
@@ -27,7 +27,7 @@ Gem::Specification.new do |spec|
27
27
  spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
28
28
 
29
29
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
30
- spec.add_development_dependency "rake", "~> 10.0"
30
+ spec.add_development_dependency "rake", "~> 13.0"
31
31
  spec.add_development_dependency "rspec", "~> 3.0"
32
32
  spec.add_development_dependency "ruby-jing"
33
33
  spec.add_development_dependency "simplecov"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-calconnect
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.0
4
+ version: 1.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-08-26 00:00:00.000000000 Z
11
+ date: 2021-09-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: equivalent-xml
@@ -30,14 +30,14 @@ dependencies:
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '10.0'
33
+ version: '13.0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '10.0'
40
+ version: '13.0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -163,6 +163,7 @@ files:
163
163
  - lib/relaton_calconnect.rb
164
164
  - lib/relaton_calconnect/cc_bibliographic_item.rb
165
165
  - lib/relaton_calconnect/cc_bibliography.rb
166
+ - lib/relaton_calconnect/data_fetcher.rb
166
167
  - lib/relaton_calconnect/hash_converter.rb
167
168
  - lib/relaton_calconnect/hit.rb
168
169
  - lib/relaton_calconnect/hit_collection.rb