RubyGems - relaton-calconnect - Versions diffs - 2.1.1 → 2.1.2 - Mend

relaton-calconnect 2.1.1 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/lib/relaton/calconnect/bibliography.rb +3 -1
data/lib/relaton/calconnect/data_fetcher.rb +26 -20
data/lib/relaton/calconnect/hit.rb +3 -3
data/lib/relaton/calconnect/hit_collection.rb +0 -1
data/lib/relaton/calconnect/scraper.rb +46 -249
data/lib/relaton/calconnect/version.rb +1 -1
data/relaton-calconnect.gemspec +2 -1
metadata +19 -5

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: '0645855bad704efa0a6ab4dafc414c3835b3815815144d8dcf1cc3f2a44d8abf'
-  data.tar.gz: b76c2f1cce88c496b2888041017c4a5c2b1977e22a1945c46e67ed3684dc1d8b
+  metadata.gz: 0e564957ec130560aa31657ed12ccad46243f22127c41a03bee8829408e6a8af
+  data.tar.gz: 06c5cbc11caf9f1673328914c19e4730c5f1204ecc28bb4c9826b5f3f6e607a7
 SHA512:
-  metadata.gz: 326da7cd637da6b0ed4e891b955a0c3975ae6489246d2648ac3fe5a2f1889ab87d36e7535a43845a29770bfc6adc6eaa3bca2a848fac8b5ceb633d762951f00b
-  data.tar.gz: 51db972bff8e2038d5aa95476b41d201e27c7a94e7a3e5dff205fe71885507c09eef4fd672e09be19f8888faaff3b7fcf747e570fe6560239c851839548dccb1
+  metadata.gz: f5fb38e5bd6f32f9cef1c06093ffa5c5bf78453ae95606c3eaf1ddbda83a7b35a6cef6aabbe05c8c441d06abb027a11ae84bbf73d40f6a51fed3730396bfccd7
+  data.tar.gz: 3ef7d05cb914af53ac57530e4be9ca358ade82a1ef628d5c6922e721cda2aec992dd85b823e902c7ded6b14b34cc96646450e43df005aee0e209b7bc76f36800

data/lib/relaton/calconnect/bibliography.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+require "mechanize"
 module Relaton::Calconnect
   class Bibliography
     class << self
@@ -5,7 +7,7 @@ module Relaton::Calconnect
       # @return [RelatonCalconnect::HitCollection]
       def search(text, year = nil, _opts = {})
         HitCollection.new text, year
-      rescue Faraday::ConnectionFailed
+      rescue Mechanize::ResponseCodeError, SocketError, Errno::ECONNREFUSED
         raise Relaton::RequestError, "Could not access https://standards.calconnect.org"
       end

data/lib/relaton/calconnect/data_fetcher.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # frozen_string_literal:true
-require "yaml"
-require "faraday"
+require "json"
+require "mechanize"
 require "relaton/core"
 require "relaton/index"
 require_relative "scraper"
@@ -12,12 +12,7 @@ module Relaton::Calconnect
   # Relaton-calconnect data fetcher
   #
   class DataFetcher < Relaton::Core::DataFetcher
-    # DOMAIN = "https://standards.calconnect.org/"
-    # SCHEME, HOST = DOMAIN.split(%r{:?/?/})
-    ENDPOINT = "https://standards.calconnect.org/relaton/index.yaml"
-    # DATADIR = "data"
-    # DATAFILE = File.join DATADIR, "bibliography.yml"
-    # ETAGFILE = File.join DATADIR, "etag.txt"
+    ENDPOINT = "https://standards.calconnect.org/cc/index.json"
     def etagfile
       @etagfile ||= File.join @output, "etag.txt"
@@ -31,18 +26,23 @@ module Relaton::Calconnect
       Util.error msg
     end
+    def agent
+      @agent ||= Mechanize.new
+    end
     #
     # fetch data from server and save it to file.
     #
     def fetch(_source = nil) # rubocop:disable Metrics/AbcSize
-      resp = Faraday.new(ENDPOINT, headers: { "If-None-Match" => etag }).get
-      # return if there aren't any changes since last fetching
-      return unless resp.status == 200
+      agent.request_headers["If-None-Match"] = etag if etag
+      resp = agent.get(ENDPOINT)
+      # 304 Not Modified — nothing changed since the last fetch
+      return if resp.code == "304"
-      data = YAML.safe_load resp.body
+      data = JSON.parse resp.body
       all_success = true
-      data["root"]["items"].each { |doc| all_success &&= parse_page doc }
-      self.etag = resp[:etag] if all_success
+      Array(data["documents"]).each { |doc| all_success &&= parse_page doc }
+      self.etag = resp.response["etag"] if all_success
       index.save
       report_errors
     end
@@ -56,27 +56,33 @@ module Relaton::Calconnect
     #
     def parse_page(doc)
       bib = Scraper.new(@errors).parse_page doc
-      # bib.link.each { |l| l.content.merge!(scheme: SCHEME, host: HOST) unless l.content.host }
-      write_doc doc["docid"][0]["id"], bib
+      write_doc doc["id"], bib
       true
     rescue StandardError => e
-      Util.warn "Document: #{doc['docid'][0]['id']}"
+      Util.warn "Document: #{doc['id']}"
       Util.warn e.message
       Util.warn e.backtrace[0..5].join("\n")
       false
     end
-    def write_doc(docid, bib) # rubocop:disable Metrics/MethodLength
-      file = output_file docid
+    def write_doc(slug, bib) # rubocop:disable Metrics/MethodLength
+      file = output_file slug
       if @files.include? file
         Util.warn "#{file} exist"
       else
         @files << file
       end
-      index.add_or_update docid, file
+      index.add_or_update primary_docid(bib), file
       File.write file, serialize(bib), encoding: "UTF-8"
     end
+    # Index entries are keyed by the canonical doc identifier
+    # (e.g. "CC/DIR 10005:2019"), not the upstream slug used for filenames.
+    def primary_docid(bib)
+      docid = bib.docidentifier.find(&:primary) || bib.docidentifier.first
+      docid.content
+    end
     def to_yaml(bib) = bib.to_yaml
     def to_xml(bib) = bib.to_xml(bibdata: true)
     def to_bibxml(bib) = bib.to_rfcxml

data/lib/relaton/calconnect/hit.rb CHANGED Viewed

@@ -1,13 +1,13 @@
+require "mechanize"
 module Relaton::Calconnect
   class Hit < Relaton::Core::Hit
     # Parse page.
     # @return [Relaton::Calconnect::ItemData]
     def item
-      # @fetch ||= Scraper.parse_page @hit
       @item ||= begin
         url = "#{HitCollection::GHURL}#{@hit[:file]}"
-        resp = Faraday.get url
-        Item.from_yaml resp.body
+        Item.from_yaml Mechanize.new.get(url).body
       end
     end
   end

data/lib/relaton/calconnect/hit_collection.rb CHANGED Viewed

@@ -1,4 +1,3 @@
-require "faraday"
 require "yaml"
 require "fileutils"

data/lib/relaton/calconnect/scraper.rb CHANGED Viewed

@@ -1,5 +1,8 @@
-require "addressable/uri"
+require "mechanize"
+require "stringio"
+require "zip"
 require_relative "model/item"
+require_relative "model/bibdata"
 module Relaton
   module Calconnect
@@ -7,9 +10,8 @@ module Relaton
       include Core::HashKeysSymbolizer
       include Core::ArrayWrapper
-      DOMAIN = "https://standards.calconnect.org/".freeze
-      SCHEME, HOST = DOMAIN.split(%r{:?/?/})
-      # DOMAIN = "http://127.0.0.1:4000/".freeze
+      RELEASE_ASSET_URL = "https://github.com/%<owner>s/%<repo>s/releases/download/" \
+                          "%<tag>s/%<asset_stem>s.zip".freeze
       # @param errors [Hash] error tracking hash
       def initialize(errors = {})
@@ -17,273 +19,68 @@ module Relaton
       end
       #
-      # Parse document page
+      # Parse an aggregate-index document entry: download the per-document
+      # GitHub release zip, extract the RXL, and parse it into a bibitem.
       #
-      # @papam hit [Hash] document hash
+      # @param hit [Hash] document entry from /cc/index.json
       #
       # @return [Relaton::Calconnect::ItemData] bibliographic item
       #
-      def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
-        hash = symbolize_hash_keys hit
-        links = array(hash[:link])
-        link = links.detect { |l| l[:type] == "rxl" }
-        if link
-          bib = fetch_bib_xml link[:content]
-          update_links bib, links
-        else
-          hash.delete :fetched
-          bib = hash_to_item hash
-        end
-        update_sources bib
-        bib
-      end
-      private
-      #
-      # Fetch bibliographic item from XML source
-      #
-      # @param url [String] URL to fetch
-      #
-      # @return [RelatonCalconnect::CcBibliographicItem] bibliographic item
-      #
-      def fetch_bib_xml(url) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
-        rxl = get_rxl url
-        uri_rxl = rxl.at("uri[@type='rxl']")
-        if uri_rxl
-          uri_xml = rxl.xpath("//uri").to_xml
-          rxl = get_rxl uri_rxl.text
-          docid = rxl.at "//docidentifier"
-          docid.add_previous_sibling uri_xml
-        end
-        xml = rxl.to_xml.gsub(%r{(</?)technical-committee(>)}, '\1committee\2')
-          .gsub(%r{type="(?:csd|CC)"(?=>)}i, '\0 primary="true"')
+      def parse_page(hit)
+        zip_data = download_release_zip hit
+        rxl = extract_rxl zip_data, rxl_filename(hit)
+        xml = normalize_rxl rxl
         Item.from_xml xml
       end
-      # @param path [String]
-      # @return [Nokogiri::XML::Document]
-      def get_rxl(path)
-        resp = Faraday.get DOMAIN + path
-        Nokogiri::XML resp.body
-      end
-      #
-      # Fix editorial group
-      #
-      # @param [Hash] doc
-      #
-      # @return [Hash]
-      #
-      def hash_to_item(hash)
-        hash_to_title hash
-        hash_to_source hash
-        hash_to_docid hash
-        hash_to_date hash
-        hash_to_contributor hash
-        hash_to_edition hash
-        hash_to_version hash
-        hosh_to_abstract hash
-        hash_to_status hash
-        hash_to_relation hash
-        hash_to_copyrigh hash
-        hash_to_keyword hash
-        hash_to_editorialgroup hash
-        hash_to_ext hash
-        ItemData.new(**hash)
-      end
-      def hash_to_title(hash)
-        hash[:title] = array(hash[:title]).map do |t|
-          t[:language] = t[:language].first if t[:language].is_a? Array
-          t[:script] = t[:script].first if t[:script].is_a? Array
-          t.delete :format
-          Bib::Title.new(**t)
-        end
-        @errors[:title] &&= hash[:title].empty?
-      end
-      def hash_to_source(hash)
-        hash[:source] = array(hash[:link]).map { |link| Bib::Uri.new(type: "src", **link) }
-        @errors[:source] &&= hash[:source].empty?
-      end
-      def hash_to_docid(hash)
-        docid = hash.delete(:docid)
-        @errors[:docid] &&= docid.nil?
-        return unless docid
-        docid_types = %w[CC CSD]
-        hash[:docidentifier] = array(docid).map do |id|
-          id[:primary] = true if docid_types.include? id[:type].upcase
-          id[:content] = id.delete(:id) if id[:id]
-          Bib::Docidentifier.new(**id)
-        end
-      end
-      def hash_to_date(hash)
-        hash[:date] = array(hash[:date]).map do |d|
-          d[:at] = d.delete(:value) if d[:value]
-          Bib::Date.new(**d)
-        end
-        @errors[:date] &&= hash[:date].empty?
-      end
-      def hash_to_contributor(hash)
-        hash[:contributor] = array(hash[:contributor]).map do |contrib|
-          if contrib[:organization]
-            contrib[:organization] = create_organization contrib[:organization]
-          elsif contrib[:person]
-            contrib[:person] = create_person contrib[:person]
-          end
-          contrib[:role] = array(contrib[:role]).map do |role|
-            role[:description] = array(role[:description]).map do |desc|
-              Bib::LocalizedMarkedUpString.new content: desc
-            end
-            Bib::Contributor::Role.new(**role)
-          end
-          Bib::Contributor.new(**contrib)
-        end
-        @errors[:contributor] &&= hash[:contributor].empty?
-      end
-      def create_organization(org_hash)
-        org_name = array(org_hash[:name]).each { |name| Bib::TypedLocalizedString.new(**name) }
-        contact = create_contact org_hash[:contact]
-        Bib::Organization.new(name: org_name, **contact)
-      end
-      def create_contact(contact_hash)
-        array(contact_hash).each_with_object({address: [], email: [], uri: []}) do |cont, acc|
-          case cont
-          in { address: addr_hash }
-            acc[:address] = Bib::Address.new(**addr_hash)
-          in { email: email }
-            acc[:email] << email
-          in { uri: uri }
-            acc[:uri] << Bib::Uri.new(content: uri)
-          end
-        end
-      end
-      def create_person(person_hash)
-        completename = Bib::LocalizedString.new(**person_hash[:name][:completename])
-        name = Bib::FullName.new completename: completename
-        affiliation = array(person_hash[:affiliation]).map do |aff|
-          org = create_organization aff[:organization]
-          Bib::Affiliation.new(organization: org)
-        end
-        contact = create_contact person_hash[:contact]
-        Bib::Person.new(name: name, affiliation: affiliation, **contact)
-      end
-      def hash_to_edition(hash)
-        number = hash.dig(:edition, :content)
-        @errors[:edition] &&= number.nil?
-        hash[:edition] = Bib::Edition.new(number: number) if number
-      end
-      def hash_to_version(hash)
-        hash[:version] = array(hash[:version]).map do |ver|
-          Bib::Version.new(revision_date: ver[:revision_date])
-        end
-      end
-      def hosh_to_abstract(hash)
-        hash[:abstract] = array(hash[:abstract]).map do |abs|
-          Bib::Abstract.new(**abs)
-        end
-        @errors[:abstract] &&= hash[:abstract].empty?
-      end
-      def hash_to_status(hash)
-        docstatus = hash.delete(:docstatus)
-        @errors[:status] &&= docstatus.nil?
-        return unless docstatus
-        stage = Bib::Status::Stage.new content: docstatus.dig(:stage, :value)
-        hash[:status] = Bib::Status.new stage: stage
-      end
+      private
-      def hash_to_relation(hash)
-        hash[:relation] = array(hash[:relation]).map do |rel|
-          Bib::Relation.new(type: rel[:type], bibitem: hash_to_item(rel[:bibitem]))
-        end
-        @errors[:relation] &&= hash[:relation].empty?
+      def release_zip_url(hit)
+        source = hit["source"] || {}
+        format(
+          RELEASE_ASSET_URL,
+          owner: source["owner"],
+          repo: source["repo"],
+          tag: source["tag"],
+          asset_stem: asset_stem(hit),
+        )
       end
-      def hash_to_copyrigh(hash)
-        hash[:copyright] = array(hash[:copyright]).map do |cr|
-          cr[:owner] = array(cr[:owner]).map do |owner|
-            org_name = array(owner[:name]).map do |name|
-              Bib::TypedLocalizedString.new(**name)
-            end
-            Bib::ContributionInfo.new organization: Bib::Organization.new(name: org_name)
-          end
-          Bib::Copyright.new(**cr)
-        end
-        @errors[:copyright] &&= hash[:copyright].empty?
+      def rxl_filename(hit)
+        "#{asset_stem(hit)}.rxl"
       end
-      def hash_to_keyword(hash)
-        hash[:keyword] = array(hash[:keyword]).map do |kw|
-          vocab = Bib::LocalizedString.new(**kw)
-          Bib::Keyword.new(vocab: vocab)
-        end
-        @errors[:keyword] &&= hash[:keyword].empty?
+      # The release asset uses the tag with the slash replaced by a hyphen,
+      # which encodes both the document id and the release qualifier
+      # (e.g. `ed1`, `ed1-wd`).
+      def asset_stem(hit)
+        (hit["source"] && hit["source"]["tag"] || "").tr("/", "-")
       end
-      def hash_to_ext(hash)
-        return unless hash[:ext]
-        hash_to_doctype hash[:ext]
-        hash[:ext] = Ext.new(flavor: "calconnect", **hash.delete(:ext))
+      def download_release_zip(hit)
+        url = release_zip_url(hit)
+        agent.get(url).body
+      rescue Mechanize::ResponseCodeError => e
+        raise "Failed to download release zip #{url}: HTTP #{e.response_code}"
       end
-      def hash_to_doctype(ext)
-        @errors[:doctype] &&= ext[:doctype].nil?
-        return unless ext[:doctype]
-        ext[:doctype] = Doctype.new content: ext.dig(:doctype, :type), abbreviation: ext.dig(:doctype, :abbreviation)
+      def agent
+        @agent ||= Mechanize.new
       end
-      def hash_to_editorialgroup(hash)
-        eg = hash.delete(:editorialgroup) || (hash[:ext] && hash[:ext].delete(:editorialgroup))
-        @errors[:editorialgroup] &&= eg.nil?
-        return unless eg
+      def extract_rxl(zip_data, filename)
+        Zip::File.open_buffer(StringIO.new(zip_data)) do |zip|
+          entry = zip.find_entry(filename)
+          raise "RXL file #{filename} not found in release zip" unless entry
-        # Normalize: editorialgroup can be a single hash or an array of hashes
-        groups = array(eg).map do |g|
-          g = g[:technical_committee] if g.is_a?(Hash) && g[:technical_committee]
-          g
+          return entry.get_input_stream.read
         end
-        subdivisions = groups.map do |g|
-          subdiv_name = Bib::TypedLocalizedString.new content: g[:name]
-          Bib::Subdivision.new(type: "technical-committee", name: [subdiv_name])
-        end
-        org_name = Bib::TypedLocalizedString.new content: "CalConnect"
-        org = Bib::Organization.new name: [org_name], subdivision: subdivisions
-        description = Bib::LocalizedMarkedUpString.new content: "committee"
-        role = Bib::Contributor::Role.new type: "author", description: [description]
-        hash[:contributor] ||= []
-        hash[:contributor] << Bib::Contributor.new(organization: org, role: [role])
-      end
-      def update_links(bib, links)
-        links.each do |l|
-          tu = l.transform_keys(&:to_sym)
-          bib.source << Relaton::Bib::Uri.new(**tu) unless bib.source(l[:type])
-        end
-        bib
       end
-      def update_sources(bib)
-        bib.source.each do |l|
-          uri = Addressable::URI.parse l.content
-          l.content = uri.merge(scheme: SCHEME, host: HOST).to_s unless uri.host
-        end
+      def normalize_rxl(xml)
+        xml.gsub(%r{(</?)technical-committee(>)}, '\1committee\2')
+          .gsub(%r{type="(?:csd|CC)"(?=>)}i, '\0 primary="true"')
+          .gsub(%r{type="Technical committee"}, 'type="technical-committee"')
       end
     end
   end

data/lib/relaton/calconnect/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module Relaton
   module Calconnect
-    VERSION = "2.1.1".freeze
+    VERSION = "2.1.2".freeze
   end
 end

data/relaton-calconnect.gemspec CHANGED Viewed

@@ -26,9 +26,10 @@ Gem::Specification.new do |spec|
   spec.require_paths = ["lib"]
   spec.required_ruby_version = Gem::Requirement.new(">= 3.2.0")
-  spec.add_dependency "faraday", "~> 2.7.0"
+  spec.add_dependency "mechanize", "~> 2.10"
   spec.add_dependency "relaton-bib", "~> 2.1.0"
   spec.add_dependency "relaton-core", "~> 0.0.12"
   spec.add_dependency "addressable", "~> 2.8"
   spec.add_dependency "relaton-index", "~> 0.2.0"
+  spec.add_dependency "rubyzip", "~> 2.3"
 end

metadata CHANGED Viewed

@@ -1,29 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: relaton-calconnect
 version: !ruby/object:Gem::Version
-  version: 2.1.1
+  version: 2.1.2
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-05-12 00:00:00.000000000 Z
+date: 2026-05-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: faraday
+  name: mechanize
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 2.7.0
+        version: '2.10'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 2.7.0
+        version: '2.10'
 - !ruby/object:Gem::Dependency
   name: relaton-bib
   requirement: !ruby/object:Gem::Requirement
@@ -80,6 +80,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: 0.2.0
+- !ruby/object:Gem::Dependency
+  name: rubyzip
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.3'
 description: 'Relaton::Calconnect: retrieve CC Standards for bibliographic use using
   the BibliographicItem model'
 email: