relaton-calconnect 2.1.1 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '0645855bad704efa0a6ab4dafc414c3835b3815815144d8dcf1cc3f2a44d8abf'
4
- data.tar.gz: b76c2f1cce88c496b2888041017c4a5c2b1977e22a1945c46e67ed3684dc1d8b
3
+ metadata.gz: 0e564957ec130560aa31657ed12ccad46243f22127c41a03bee8829408e6a8af
4
+ data.tar.gz: 06c5cbc11caf9f1673328914c19e4730c5f1204ecc28bb4c9826b5f3f6e607a7
5
5
  SHA512:
6
- metadata.gz: 326da7cd637da6b0ed4e891b955a0c3975ae6489246d2648ac3fe5a2f1889ab87d36e7535a43845a29770bfc6adc6eaa3bca2a848fac8b5ceb633d762951f00b
7
- data.tar.gz: 51db972bff8e2038d5aa95476b41d201e27c7a94e7a3e5dff205fe71885507c09eef4fd672e09be19f8888faaff3b7fcf747e570fe6560239c851839548dccb1
6
+ metadata.gz: f5fb38e5bd6f32f9cef1c06093ffa5c5bf78453ae95606c3eaf1ddbda83a7b35a6cef6aabbe05c8c441d06abb027a11ae84bbf73d40f6a51fed3730396bfccd7
7
+ data.tar.gz: 3ef7d05cb914af53ac57530e4be9ca358ade82a1ef628d5c6922e721cda2aec992dd85b823e902c7ded6b14b34cc96646450e43df005aee0e209b7bc76f36800
@@ -1,3 +1,5 @@
1
+ require "mechanize"
2
+
1
3
  module Relaton::Calconnect
2
4
  class Bibliography
3
5
  class << self
@@ -5,7 +7,7 @@ module Relaton::Calconnect
5
7
  # @return [RelatonCalconnect::HitCollection]
6
8
  def search(text, year = nil, _opts = {})
7
9
  HitCollection.new text, year
8
- rescue Faraday::ConnectionFailed
10
+ rescue Mechanize::ResponseCodeError, SocketError, Errno::ECONNREFUSED
9
11
  raise Relaton::RequestError, "Could not access https://standards.calconnect.org"
10
12
  end
11
13
 
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal:true
2
2
 
3
- require "yaml"
4
- require "faraday"
3
+ require "json"
4
+ require "mechanize"
5
5
  require "relaton/core"
6
6
  require "relaton/index"
7
7
  require_relative "scraper"
@@ -12,12 +12,7 @@ module Relaton::Calconnect
12
12
  # Relaton-calconnect data fetcher
13
13
  #
14
14
  class DataFetcher < Relaton::Core::DataFetcher
15
- # DOMAIN = "https://standards.calconnect.org/"
16
- # SCHEME, HOST = DOMAIN.split(%r{:?/?/})
17
- ENDPOINT = "https://standards.calconnect.org/relaton/index.yaml"
18
- # DATADIR = "data"
19
- # DATAFILE = File.join DATADIR, "bibliography.yml"
20
- # ETAGFILE = File.join DATADIR, "etag.txt"
15
+ ENDPOINT = "https://standards.calconnect.org/cc/index.json"
21
16
 
22
17
  def etagfile
23
18
  @etagfile ||= File.join @output, "etag.txt"
@@ -31,18 +26,23 @@ module Relaton::Calconnect
31
26
  Util.error msg
32
27
  end
33
28
 
29
+ def agent
30
+ @agent ||= Mechanize.new
31
+ end
32
+
34
33
  #
35
34
  # fetch data from server and save it to file.
36
35
  #
37
36
  def fetch(_source = nil) # rubocop:disable Metrics/AbcSize
38
- resp = Faraday.new(ENDPOINT, headers: { "If-None-Match" => etag }).get
39
- # return if there aren't any changes since last fetching
40
- return unless resp.status == 200
37
+ agent.request_headers["If-None-Match"] = etag if etag
38
+ resp = agent.get(ENDPOINT)
39
+ # 304 Not Modified — nothing changed since the last fetch
40
+ return if resp.code == "304"
41
41
 
42
- data = YAML.safe_load resp.body
42
+ data = JSON.parse resp.body
43
43
  all_success = true
44
- data["root"]["items"].each { |doc| all_success &&= parse_page doc }
45
- self.etag = resp[:etag] if all_success
44
+ Array(data["documents"]).each { |doc| all_success &&= parse_page doc }
45
+ self.etag = resp.response["etag"] if all_success
46
46
  index.save
47
47
  report_errors
48
48
  end
@@ -56,27 +56,33 @@ module Relaton::Calconnect
56
56
  #
57
57
  def parse_page(doc)
58
58
  bib = Scraper.new(@errors).parse_page doc
59
- # bib.link.each { |l| l.content.merge!(scheme: SCHEME, host: HOST) unless l.content.host }
60
- write_doc doc["docid"][0]["id"], bib
59
+ write_doc doc["id"], bib
61
60
  true
62
61
  rescue StandardError => e
63
- Util.warn "Document: #{doc['docid'][0]['id']}"
62
+ Util.warn "Document: #{doc['id']}"
64
63
  Util.warn e.message
65
64
  Util.warn e.backtrace[0..5].join("\n")
66
65
  false
67
66
  end
68
67
 
69
- def write_doc(docid, bib) # rubocop:disable Metrics/MethodLength
70
- file = output_file docid
68
+ def write_doc(slug, bib) # rubocop:disable Metrics/MethodLength
69
+ file = output_file slug
71
70
  if @files.include? file
72
71
  Util.warn "#{file} exist"
73
72
  else
74
73
  @files << file
75
74
  end
76
- index.add_or_update docid, file
75
+ index.add_or_update primary_docid(bib), file
77
76
  File.write file, serialize(bib), encoding: "UTF-8"
78
77
  end
79
78
 
79
+ # Index entries are keyed by the canonical doc identifier
80
+ # (e.g. "CC/DIR 10005:2019"), not the upstream slug used for filenames.
81
+ def primary_docid(bib)
82
+ docid = bib.docidentifier.find(&:primary) || bib.docidentifier.first
83
+ docid.content
84
+ end
85
+
80
86
  def to_yaml(bib) = bib.to_yaml
81
87
  def to_xml(bib) = bib.to_xml(bibdata: true)
82
88
  def to_bibxml(bib) = bib.to_rfcxml
@@ -1,13 +1,13 @@
1
+ require "mechanize"
2
+
1
3
  module Relaton::Calconnect
2
4
  class Hit < Relaton::Core::Hit
3
5
  # Parse page.
4
6
  # @return [Relaton::Calconnect::ItemData]
5
7
  def item
6
- # @fetch ||= Scraper.parse_page @hit
7
8
  @item ||= begin
8
9
  url = "#{HitCollection::GHURL}#{@hit[:file]}"
9
- resp = Faraday.get url
10
- Item.from_yaml resp.body
10
+ Item.from_yaml Mechanize.new.get(url).body
11
11
  end
12
12
  end
13
13
  end
@@ -1,4 +1,3 @@
1
- require "faraday"
2
1
  require "yaml"
3
2
  require "fileutils"
4
3
 
@@ -1,5 +1,8 @@
1
- require "addressable/uri"
1
+ require "mechanize"
2
+ require "stringio"
3
+ require "zip"
2
4
  require_relative "model/item"
5
+ require_relative "model/bibdata"
3
6
 
4
7
  module Relaton
5
8
  module Calconnect
@@ -7,9 +10,8 @@ module Relaton
7
10
  include Core::HashKeysSymbolizer
8
11
  include Core::ArrayWrapper
9
12
 
10
- DOMAIN = "https://standards.calconnect.org/".freeze
11
- SCHEME, HOST = DOMAIN.split(%r{:?/?/})
12
- # DOMAIN = "http://127.0.0.1:4000/".freeze
13
+ RELEASE_ASSET_URL = "https://github.com/%<owner>s/%<repo>s/releases/download/" \
14
+ "%<tag>s/%<asset_stem>s.zip".freeze
13
15
 
14
16
  # @param errors [Hash] error tracking hash
15
17
  def initialize(errors = {})
@@ -17,273 +19,68 @@ module Relaton
17
19
  end
18
20
 
19
21
  #
20
- # Parse document page
22
+ # Parse an aggregate-index document entry: download the per-document
23
+ # GitHub release zip, extract the RXL, and parse it into a bibitem.
21
24
  #
22
- # @papam hit [Hash] document hash
25
+ # @param hit [Hash] document entry from /cc/index.json
23
26
  #
24
27
  # @return [Relaton::Calconnect::ItemData] bibliographic item
25
28
  #
26
- def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
27
- hash = symbolize_hash_keys hit
28
- links = array(hash[:link])
29
- link = links.detect { |l| l[:type] == "rxl" }
30
- if link
31
- bib = fetch_bib_xml link[:content]
32
- update_links bib, links
33
- else
34
- hash.delete :fetched
35
- bib = hash_to_item hash
36
- end
37
- update_sources bib
38
- bib
39
- end
40
-
41
- private
42
-
43
- #
44
- # Fetch bibliographic item from XML source
45
- #
46
- # @param url [String] URL to fetch
47
- #
48
- # @return [RelatonCalconnect::CcBibliographicItem] bibliographic item
49
- #
50
- def fetch_bib_xml(url) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
51
- rxl = get_rxl url
52
- uri_rxl = rxl.at("uri[@type='rxl']")
53
- if uri_rxl
54
- uri_xml = rxl.xpath("//uri").to_xml
55
- rxl = get_rxl uri_rxl.text
56
- docid = rxl.at "//docidentifier"
57
- docid.add_previous_sibling uri_xml
58
- end
59
- xml = rxl.to_xml.gsub(%r{(</?)technical-committee(>)}, '\1committee\2')
60
- .gsub(%r{type="(?:csd|CC)"(?=>)}i, '\0 primary="true"')
29
+ def parse_page(hit)
30
+ zip_data = download_release_zip hit
31
+ rxl = extract_rxl zip_data, rxl_filename(hit)
32
+ xml = normalize_rxl rxl
61
33
  Item.from_xml xml
62
34
  end
63
35
 
64
- # @param path [String]
65
- # @return [Nokogiri::XML::Document]
66
- def get_rxl(path)
67
- resp = Faraday.get DOMAIN + path
68
- Nokogiri::XML resp.body
69
- end
70
-
71
- #
72
- # Fix editorial group
73
- #
74
- # @param [Hash] doc
75
- #
76
- # @return [Hash]
77
- #
78
- def hash_to_item(hash)
79
- hash_to_title hash
80
- hash_to_source hash
81
- hash_to_docid hash
82
- hash_to_date hash
83
- hash_to_contributor hash
84
- hash_to_edition hash
85
- hash_to_version hash
86
- hosh_to_abstract hash
87
- hash_to_status hash
88
- hash_to_relation hash
89
- hash_to_copyrigh hash
90
- hash_to_keyword hash
91
- hash_to_editorialgroup hash
92
- hash_to_ext hash
93
- ItemData.new(**hash)
94
- end
95
-
96
- def hash_to_title(hash)
97
- hash[:title] = array(hash[:title]).map do |t|
98
- t[:language] = t[:language].first if t[:language].is_a? Array
99
- t[:script] = t[:script].first if t[:script].is_a? Array
100
- t.delete :format
101
- Bib::Title.new(**t)
102
- end
103
- @errors[:title] &&= hash[:title].empty?
104
- end
105
-
106
- def hash_to_source(hash)
107
- hash[:source] = array(hash[:link]).map { |link| Bib::Uri.new(type: "src", **link) }
108
- @errors[:source] &&= hash[:source].empty?
109
- end
110
-
111
- def hash_to_docid(hash)
112
- docid = hash.delete(:docid)
113
- @errors[:docid] &&= docid.nil?
114
- return unless docid
115
-
116
- docid_types = %w[CC CSD]
117
- hash[:docidentifier] = array(docid).map do |id|
118
- id[:primary] = true if docid_types.include? id[:type].upcase
119
- id[:content] = id.delete(:id) if id[:id]
120
- Bib::Docidentifier.new(**id)
121
- end
122
- end
123
-
124
- def hash_to_date(hash)
125
- hash[:date] = array(hash[:date]).map do |d|
126
- d[:at] = d.delete(:value) if d[:value]
127
- Bib::Date.new(**d)
128
- end
129
- @errors[:date] &&= hash[:date].empty?
130
- end
131
-
132
- def hash_to_contributor(hash)
133
- hash[:contributor] = array(hash[:contributor]).map do |contrib|
134
- if contrib[:organization]
135
- contrib[:organization] = create_organization contrib[:organization]
136
- elsif contrib[:person]
137
- contrib[:person] = create_person contrib[:person]
138
- end
139
- contrib[:role] = array(contrib[:role]).map do |role|
140
- role[:description] = array(role[:description]).map do |desc|
141
- Bib::LocalizedMarkedUpString.new content: desc
142
- end
143
- Bib::Contributor::Role.new(**role)
144
- end
145
- Bib::Contributor.new(**contrib)
146
- end
147
- @errors[:contributor] &&= hash[:contributor].empty?
148
- end
149
-
150
- def create_organization(org_hash)
151
- org_name = array(org_hash[:name]).each { |name| Bib::TypedLocalizedString.new(**name) }
152
- contact = create_contact org_hash[:contact]
153
- Bib::Organization.new(name: org_name, **contact)
154
- end
155
-
156
- def create_contact(contact_hash)
157
- array(contact_hash).each_with_object({address: [], email: [], uri: []}) do |cont, acc|
158
- case cont
159
- in { address: addr_hash }
160
- acc[:address] = Bib::Address.new(**addr_hash)
161
- in { email: email }
162
- acc[:email] << email
163
- in { uri: uri }
164
- acc[:uri] << Bib::Uri.new(content: uri)
165
- end
166
- end
167
- end
168
-
169
- def create_person(person_hash)
170
- completename = Bib::LocalizedString.new(**person_hash[:name][:completename])
171
- name = Bib::FullName.new completename: completename
172
- affiliation = array(person_hash[:affiliation]).map do |aff|
173
- org = create_organization aff[:organization]
174
- Bib::Affiliation.new(organization: org)
175
- end
176
- contact = create_contact person_hash[:contact]
177
- Bib::Person.new(name: name, affiliation: affiliation, **contact)
178
- end
179
-
180
- def hash_to_edition(hash)
181
- number = hash.dig(:edition, :content)
182
- @errors[:edition] &&= number.nil?
183
- hash[:edition] = Bib::Edition.new(number: number) if number
184
- end
185
-
186
- def hash_to_version(hash)
187
- hash[:version] = array(hash[:version]).map do |ver|
188
- Bib::Version.new(revision_date: ver[:revision_date])
189
- end
190
- end
191
-
192
- def hosh_to_abstract(hash)
193
- hash[:abstract] = array(hash[:abstract]).map do |abs|
194
- Bib::Abstract.new(**abs)
195
- end
196
- @errors[:abstract] &&= hash[:abstract].empty?
197
- end
198
-
199
- def hash_to_status(hash)
200
- docstatus = hash.delete(:docstatus)
201
- @errors[:status] &&= docstatus.nil?
202
- return unless docstatus
203
-
204
- stage = Bib::Status::Stage.new content: docstatus.dig(:stage, :value)
205
- hash[:status] = Bib::Status.new stage: stage
206
- end
36
+ private
207
37
 
208
- def hash_to_relation(hash)
209
- hash[:relation] = array(hash[:relation]).map do |rel|
210
- Bib::Relation.new(type: rel[:type], bibitem: hash_to_item(rel[:bibitem]))
211
- end
212
- @errors[:relation] &&= hash[:relation].empty?
38
+ def release_zip_url(hit)
39
+ source = hit["source"] || {}
40
+ format(
41
+ RELEASE_ASSET_URL,
42
+ owner: source["owner"],
43
+ repo: source["repo"],
44
+ tag: source["tag"],
45
+ asset_stem: asset_stem(hit),
46
+ )
213
47
  end
214
48
 
215
- def hash_to_copyrigh(hash)
216
- hash[:copyright] = array(hash[:copyright]).map do |cr|
217
- cr[:owner] = array(cr[:owner]).map do |owner|
218
- org_name = array(owner[:name]).map do |name|
219
- Bib::TypedLocalizedString.new(**name)
220
- end
221
- Bib::ContributionInfo.new organization: Bib::Organization.new(name: org_name)
222
- end
223
- Bib::Copyright.new(**cr)
224
- end
225
- @errors[:copyright] &&= hash[:copyright].empty?
49
+ def rxl_filename(hit)
50
+ "#{asset_stem(hit)}.rxl"
226
51
  end
227
52
 
228
- def hash_to_keyword(hash)
229
- hash[:keyword] = array(hash[:keyword]).map do |kw|
230
- vocab = Bib::LocalizedString.new(**kw)
231
- Bib::Keyword.new(vocab: vocab)
232
- end
233
- @errors[:keyword] &&= hash[:keyword].empty?
53
+ # The release asset uses the tag with the slash replaced by a hyphen,
54
+ # which encodes both the document id and the release qualifier
55
+ # (e.g. `ed1`, `ed1-wd`).
56
+ def asset_stem(hit)
57
+ (hit["source"] && hit["source"]["tag"] || "").tr("/", "-")
234
58
  end
235
59
 
236
- def hash_to_ext(hash)
237
- return unless hash[:ext]
238
-
239
- hash_to_doctype hash[:ext]
240
- hash[:ext] = Ext.new(flavor: "calconnect", **hash.delete(:ext))
60
+ def download_release_zip(hit)
61
+ url = release_zip_url(hit)
62
+ agent.get(url).body
63
+ rescue Mechanize::ResponseCodeError => e
64
+ raise "Failed to download release zip #{url}: HTTP #{e.response_code}"
241
65
  end
242
66
 
243
- def hash_to_doctype(ext)
244
- @errors[:doctype] &&= ext[:doctype].nil?
245
- return unless ext[:doctype]
246
-
247
- ext[:doctype] = Doctype.new content: ext.dig(:doctype, :type), abbreviation: ext.dig(:doctype, :abbreviation)
67
+ def agent
68
+ @agent ||= Mechanize.new
248
69
  end
249
70
 
250
- def hash_to_editorialgroup(hash)
251
- eg = hash.delete(:editorialgroup) || (hash[:ext] && hash[:ext].delete(:editorialgroup))
252
- @errors[:editorialgroup] &&= eg.nil?
253
- return unless eg
71
+ def extract_rxl(zip_data, filename)
72
+ Zip::File.open_buffer(StringIO.new(zip_data)) do |zip|
73
+ entry = zip.find_entry(filename)
74
+ raise "RXL file #{filename} not found in release zip" unless entry
254
75
 
255
- # Normalize: editorialgroup can be a single hash or an array of hashes
256
- groups = array(eg).map do |g|
257
- g = g[:technical_committee] if g.is_a?(Hash) && g[:technical_committee]
258
- g
76
+ return entry.get_input_stream.read
259
77
  end
260
-
261
- subdivisions = groups.map do |g|
262
- subdiv_name = Bib::TypedLocalizedString.new content: g[:name]
263
- Bib::Subdivision.new(type: "technical-committee", name: [subdiv_name])
264
- end
265
-
266
- org_name = Bib::TypedLocalizedString.new content: "CalConnect"
267
- org = Bib::Organization.new name: [org_name], subdivision: subdivisions
268
- description = Bib::LocalizedMarkedUpString.new content: "committee"
269
- role = Bib::Contributor::Role.new type: "author", description: [description]
270
- hash[:contributor] ||= []
271
- hash[:contributor] << Bib::Contributor.new(organization: org, role: [role])
272
- end
273
-
274
- def update_links(bib, links)
275
- links.each do |l|
276
- tu = l.transform_keys(&:to_sym)
277
- bib.source << Relaton::Bib::Uri.new(**tu) unless bib.source(l[:type])
278
- end
279
- bib
280
78
  end
281
79
 
282
- def update_sources(bib)
283
- bib.source.each do |l|
284
- uri = Addressable::URI.parse l.content
285
- l.content = uri.merge(scheme: SCHEME, host: HOST).to_s unless uri.host
286
- end
80
+ def normalize_rxl(xml)
81
+ xml.gsub(%r{(</?)technical-committee(>)}, '\1committee\2')
82
+ .gsub(%r{type="(?:csd|CC)"(?=>)}i, '\0 primary="true"')
83
+ .gsub(%r{type="Technical committee"}, 'type="technical-committee"')
287
84
  end
288
85
  end
289
86
  end
@@ -1,5 +1,5 @@
1
1
  module Relaton
2
2
  module Calconnect
3
- VERSION = "2.1.1".freeze
3
+ VERSION = "2.1.2".freeze
4
4
  end
5
5
  end
@@ -26,9 +26,10 @@ Gem::Specification.new do |spec|
26
26
  spec.require_paths = ["lib"]
27
27
  spec.required_ruby_version = Gem::Requirement.new(">= 3.2.0")
28
28
 
29
- spec.add_dependency "faraday", "~> 2.7.0"
29
+ spec.add_dependency "mechanize", "~> 2.10"
30
30
  spec.add_dependency "relaton-bib", "~> 2.1.0"
31
31
  spec.add_dependency "relaton-core", "~> 0.0.12"
32
32
  spec.add_dependency "addressable", "~> 2.8"
33
33
  spec.add_dependency "relaton-index", "~> 0.2.0"
34
+ spec.add_dependency "rubyzip", "~> 2.3"
34
35
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-calconnect
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-12 00:00:00.000000000 Z
11
+ date: 2026-05-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: faraday
14
+ name: mechanize
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 2.7.0
19
+ version: '2.10'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 2.7.0
26
+ version: '2.10'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: relaton-bib
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: 0.2.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: rubyzip
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '2.3'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '2.3'
83
97
  description: 'Relaton::Calconnect: retrieve CC Standards for bibliographic use using
84
98
  the BibliographicItem model'
85
99
  email: