relaton-calconnect 1.9.0 → 1.9.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1c3a78f7962d9b70c4dc6a30bd002f9973998f55d52af6612f63beb58e5109bf
4
- data.tar.gz: e0f8ac1a2c0c676dca4e1f3ebf0ae044a453a143824f953e860017777b01a587
3
+ metadata.gz: 276dc8eec3c63bb0992e5986afcf1c45264e4541c5f008697c51f758be583d94
4
+ data.tar.gz: 50512e178a70c19233a71a7ecfd081efa23128227e4ec9de73d4d270a8c1a1aa
5
5
  SHA512:
6
- metadata.gz: 66e83c6550539dcacb9583e9f90d67666b70ac9e39b8802a737741600b88ee3c1ca11c7cd9275acccc9ef023e9602a3495f573f75d76845070864f4335e64e8b
7
- data.tar.gz: c7ace3d658e7c003847a1cc46b11385cf868d9fc47812e57c46d8ce08a87f865aa61a63b3e5724cc4a9c4554ca90d014cc7cdd7627ed9d09444c07aeba168c19
6
+ metadata.gz: 3786fc11fc5a004f691d3f5994bd4289ab7f71116cfb0de5f5e3d2477a6460a7b3b4a7073b91bdd9b736f046bad68e824fa22cb983727589a89899d168419615
7
+ data.tar.gz: a0032955308db628a798ee1f9a45616e49e82799031ee205de6b73298a768fee8ef175835330567ef5709c8a2ba5d853ca0d10c865fdc793baf966c472bb158c
data/README.adoc CHANGED
@@ -106,6 +106,25 @@ RelatonCalconnect::CcBibliographicItem.from_hash hash
106
106
  ...
107
107
  ----
108
108
 
109
+ === Fetch data
110
+
111
+ This gem uses the https://standards.calconnect.org/relaton/index.yaml dataset as one of its data sources.
112
+
113
+ The method `RelatonCalconnect::DataFetcher.fetch(output: "data", format: "yaml")` fetches all the documents from the dataset and saves them to the `./data` folder in YAML format.
114
+ Arguments:
115
+
116
+ - `output` - folder to save documents (default './data').
117
+ - `format` - format in which the documents are saved. Possible formats are: `yaml`, `xml` (default `yaml`).
118
+
119
+ [source,ruby]
120
+ ----
121
+ RelatonCalconnect::DataFetcher.fetch
122
+ Started at: 2021-09-09 16:03:51 +0200
123
+ Stopped at: 2021-09-09 16:04:12 +0200
124
+ Done in: 20 sec.
125
+ => nil
126
+ ----
127
+
109
128
  == Development
110
129
 
111
130
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal:true
2
+
3
+ module RelatonCalconnect
4
+ #
5
+ # Relaton-calconnect data fetcher
6
+ #
7
+ class DataFetcher
8
+ # DOMAIN = "https://standards.calconnect.org/"
9
+ # SCHEME, HOST = DOMAIN.split(%r{:?/?/})
10
+ ENDPOINT = "https://standards.calconnect.org/relaton/index.yaml"
11
+ # DATADIR = "data"
12
+ # DATAFILE = File.join DATADIR, "bibliography.yml"
13
+ # ETAGFILE = File.join DATADIR, "etag.txt"
14
+
15
+ def initialize(output, format)
16
+ @output = output
17
+ @etagfile = File.join output, "etag.txt"
18
+ @format = format
19
+ end
20
+
21
+ def self.fetch(output: "data", format: "yaml")
22
+ t1 = Time.now
23
+ puts "Started at: #{t1}"
24
+ FileUtils.mkdir_p output unless Dir.exist? output
25
+ new(output, format).fetch
26
+ t2 = Time.now
27
+ puts "Stopped at: #{t2}"
28
+ puts "Done in: #{(t2 - t1).round} sec."
29
+ end
30
+
31
+ #
32
+ # Fetch data from the server and save it to file.
33
+ #
34
+ def fetch
35
+ resp = Faraday.new(ENDPOINT, headers: { "If-None-Match" => etag }).get
36
+ # return if there aren't any changes since last fetching
37
+ return unless resp.status == 200
38
+
39
+ data = YAML.safe_load resp.body
40
+ all_success = true
41
+ data["root"]["items"].each do |doc|
42
+ success = parse_page doc
43
+ all_success &&= success
44
+ end
45
+ self.etag = resp[:etag] if all_success
46
+ end
47
+
48
+ private
49
+
50
+ #
51
+ # Parse document and write it to file
52
+ #
53
+ # @param [Hash] doc
54
+ #
55
+ def parse_page(doc)
56
+ bib = Scrapper.parse_page doc
57
+ # bib.link.each { |l| l.content.merge!(scheme: SCHEME, host: HOST) unless l.content.host }
58
+ write_doc doc["docid"]["id"], bib
59
+ true
60
+ rescue StandardError => e
61
+ warn "Document: #{doc['docid']['id']}"
62
+ warn e.message
63
+ puts e.backtrace
64
+ false
65
+ end
66
+
67
+ def write_doc(docid, bib)
68
+ content = @format == "xml" ? bib.to_xml(bibdata: true) : bib.to_hash.to_yaml
69
+ file = File.join @output, "#{docid.downcase.gsub(%r{[/\s:]}, '_')}.#{@format}"
70
+ # if File.exist? file
71
+ # warn "#{file} exist"
72
+ # else
73
+ File.write file, content, encoding: "UTF-8"
74
+ # end
75
+ end
76
+
77
+ #
78
+ # Read ETag from file
79
+ #
80
+ # @return [String, NilClass]
81
+ def etag
82
+ @etag ||= File.exist?(@etagfile) ? File.read(@etagfile, encoding: "UTF-8") : nil
83
+ end
84
+
85
+ #
86
+ # Save ETag to file
87
+ #
88
+ # @param e_tag [String]
89
+ def etag=(e_tag)
90
+ File.write @etagfile, e_tag, encoding: "UTF-8"
91
+ end
92
+ end
93
+ end
@@ -4,11 +4,12 @@ module RelatonCalconnect
4
4
  class Processor < Relaton::Processor
5
5
  attr_reader :idtype
6
6
 
7
- def initialize
7
+ def initialize # rubocop:disable Lint/MissingSuper
8
8
  @short = :relaton_calconnect
9
9
  @prefix = "CC"
10
10
  @defaultprefix = %r{^CC\s}
11
11
  @idtype = "CC"
12
+ @datasets = %w[calconnect-org]
12
13
  end
13
14
 
14
15
  # @param code [String]
@@ -19,6 +20,18 @@ module RelatonCalconnect
19
20
  ::RelatonCalconnect::CcBibliography.get(code, date, opts)
20
21
  end
21
22
 
23
+ #
24
+ # Fetch all the documents from a source
25
+ #
26
+ # @param [String] _source source name
27
+ # @param [Hash] opts
28
+ # @option opts [String] :output directory to output documents
29
+ # @option opts [String] :format
30
+ #
31
+ def fetch_data(_source, opts)
32
+ DataFetcher.fetch(**opts)
33
+ end
34
+
22
35
  # @param xml [String]
23
36
  # @return [RelatonCalconnect::CcBibliographicItem]
24
37
  def from_xml(xml)
@@ -1,17 +1,24 @@
1
1
  module RelatonCalconnect
2
2
  module Scrapper
3
3
  DOMAIN = "https://standards.calconnect.org/".freeze
4
+ SCHEME, HOST = DOMAIN.split(%r{:?/?/})
4
5
  # DOMAIN = "http://127.0.0.1:4000/".freeze
5
6
 
6
7
  class << self
7
8
  # @param hit [Hash]
8
9
  # @return [RelatonCalconnect::CcBibliographicItem]
9
10
  def parse_page(hit)
10
- link = hit["link"].detect { |l| l["type"] == "rxl" }
11
+ links = array(hit["link"])
12
+ link = links.detect { |l| l["type"] == "rxl" }
11
13
  if link
12
- bib_xml = fetch_bib_xml link["content"]
13
- XMLParser.from_xml bib_xml
14
+ bib = fetch_bib_xml link["content"]
15
+ update_links bib, links
16
+ # XMLParser.from_xml bib_xml
17
+ else
18
+ bib = RelatonCalconnect::CcBibliographicItem.from_hash doc_to_hash(hit)
14
19
  end
20
+ bib.link.each { |l| l.content.merge!(scheme: SCHEME, host: HOST) unless l.content.host }
21
+ bib
15
22
  end
16
23
 
17
24
  private
@@ -19,15 +26,25 @@ module RelatonCalconnect
19
26
  # @param url [String]
20
27
  # @return [RelatonCalconnect::CcBibliographicItem] item parsed from the fetched XML
21
28
  def fetch_bib_xml(url)
29
+ # rxl = get_rxl url
30
+ # uri_rxl = rxl.at("uri[@type='rxl']")
31
+ # return rxl.to_xml unless uri_rxl
32
+
33
+ # uri_xml = rxl.xpath("//uri").to_xml
34
+ # rxl = get_rxl uri_rxl.text
35
+ # docid = rxl.at "//docidentifier"
36
+ # docid.add_previous_sibling uri_xml
37
+ # rxl.to_xml
22
38
  rxl = get_rxl url
23
39
  uri_rxl = rxl.at("uri[@type='rxl']")
24
- return rxl.to_xml unless uri_rxl
25
-
26
- uri_xml = rxl.xpath("//uri").to_xml
27
- rxl = get_rxl uri_rxl.text
28
- docid = rxl.at "//docidentifier"
29
- docid.add_previous_sibling uri_xml
30
- rxl.to_xml
40
+ if uri_rxl
41
+ uri_xml = rxl.xpath("//uri").to_xml
42
+ rxl = get_rxl uri_rxl.text
43
+ docid = rxl.at "//docidentifier"
44
+ docid.add_previous_sibling uri_xml
45
+ end
46
+ xml = rxl.to_xml.gsub(%r{(</?)technical-committee(>)}, '\1committee\2') # non-bang gsub: gsub! returns nil when nothing is replaced
47
+ RelatonCalconnect::XMLParser.from_xml xml
31
48
  end
32
49
 
33
50
  # @param path [String]
@@ -36,6 +53,44 @@ module RelatonCalconnect
36
53
  resp = Faraday.get DOMAIN + path
37
54
  Nokogiri::XML resp.body
38
55
  end
56
+
57
+ #
58
+ # Fix editorial group
59
+ #
60
+ # @param [Hash] doc
61
+ #
62
+ # @return [Hash]
63
+ #
64
+ def doc_to_hash(doc)
65
+ array(doc["editorialgroup"]).each do |eg|
66
+ tc = eg.delete("technical_committee")
67
+ eg.merge!(tc) if tc
68
+ end
69
+ doc
70
+ end
71
+
72
+ def update_links(bib, links)
73
+ links.each do |l|
74
+ tu = l.transform_keys(&:to_sym)
75
+ bib.link << RelatonBib::TypedUri.new(**tu) unless bib.url(l["type"])
76
+ end
77
+ bib
78
+ end
79
+
80
+ #
81
+ # Wrap into Array if not Array
82
+ #
83
+ # @param [Array, Hash, String, nil] content
84
+ #
85
+ # @return [Array<Hash, String>]
86
+ #
87
+ def array(content)
88
+ case content
89
+ when Array then content
90
+ when nil then []
91
+ else [content]
92
+ end
93
+ end
39
94
  end
40
95
  end
41
96
  end
@@ -1,3 +1,3 @@
1
1
  module RelatonCalconnect
2
- VERSION = "1.9.0".freeze
2
+ VERSION = "1.9.1".freeze
3
3
  end
@@ -8,6 +8,7 @@ require "relaton_calconnect/technical_committee"
8
8
  require "relaton_calconnect/cc_bibliographic_item"
9
9
  require "relaton_calconnect/xml_parser"
10
10
  require "relaton_calconnect/hash_converter"
11
+ require "relaton_calconnect/data_fetcher"
11
12
 
12
13
  module RelatonCalconnect
13
14
  class Error < StandardError; end
@@ -27,7 +27,7 @@ Gem::Specification.new do |spec|
27
27
  spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
28
28
 
29
29
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
30
- spec.add_development_dependency "rake", "~> 10.0"
30
+ spec.add_development_dependency "rake", "~> 13.0"
31
31
  spec.add_development_dependency "rspec", "~> 3.0"
32
32
  spec.add_development_dependency "ruby-jing"
33
33
  spec.add_development_dependency "simplecov"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-calconnect
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.0
4
+ version: 1.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-08-26 00:00:00.000000000 Z
11
+ date: 2021-09-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: equivalent-xml
@@ -30,14 +30,14 @@ dependencies:
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '10.0'
33
+ version: '13.0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '10.0'
40
+ version: '13.0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -163,6 +163,7 @@ files:
163
163
  - lib/relaton_calconnect.rb
164
164
  - lib/relaton_calconnect/cc_bibliographic_item.rb
165
165
  - lib/relaton_calconnect/cc_bibliography.rb
166
+ - lib/relaton_calconnect/data_fetcher.rb
166
167
  - lib/relaton_calconnect/hash_converter.rb
167
168
  - lib/relaton_calconnect/hit.rb
168
169
  - lib/relaton_calconnect/hit_collection.rb