relaton-iso 1.20.0 → 2.0.0.pre.alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -1
- data/Gemfile +1 -0
- data/README.adoc +134 -130
- data/bin/console +1 -1
- data/grammars/basicdoc.rng +2110 -0
- data/grammars/biblio-standoc.rng +287 -0
- data/grammars/biblio.rng +2097 -0
- data/grammars/relaton-iso-compile.rng +11 -0
- data/grammars/relaton-iso.rng +214 -0
- data/lib/relaton/iso/bibliography.rb +206 -0
- data/lib/relaton/iso/data_fetcher.rb +227 -0
- data/lib/relaton/iso/hash_parser_v1.rb +121 -0
- data/lib/relaton/iso/hit.rb +62 -0
- data/lib/relaton/iso/hit_collection.rb +117 -0
- data/lib/relaton/iso/item_data.rb +49 -0
- data/lib/relaton/iso/model/bibdata.rb +9 -0
- data/lib/relaton/iso/model/bibitem.rb +7 -0
- data/lib/relaton/iso/model/contributor.rb +7 -0
- data/lib/relaton/iso/model/contributor_info.rb +9 -0
- data/lib/relaton/iso/model/docidentifier.rb +128 -0
- data/lib/relaton/iso/model/doctype.rb +13 -0
- data/lib/relaton/iso/model/ext.rb +47 -0
- data/lib/relaton/iso/model/iso_project_group.rb +21 -0
- data/lib/relaton/iso/model/item.rb +17 -0
- data/lib/relaton/iso/model/item_base.rb +19 -0
- data/lib/relaton/iso/model/organization.rb +9 -0
- data/lib/relaton/iso/model/project_number.rb +22 -0
- data/lib/relaton/iso/model/relation.rb +9 -0
- data/lib/relaton/iso/model/stagename.rb +14 -0
- data/lib/relaton/iso/model/structured_identifier.rb +31 -0
- data/lib/relaton/iso/processor.rb +78 -0
- data/lib/relaton/iso/queue.rb +63 -0
- data/lib/relaton/iso/scraper.rb +591 -0
- data/lib/relaton/iso/util.rb +8 -0
- data/lib/relaton/iso/version.rb +7 -0
- data/lib/relaton/iso.rb +17 -0
- data/relaton_iso.gemspec +9 -7
- metadata +76 -46
- data/bin/bundle +0 -109
- data/bin/byebug +0 -27
- data/bin/coderay +0 -27
- data/bin/gdb_wrapper +0 -29
- data/bin/htmldiff +0 -27
- data/bin/httpclient +0 -29
- data/bin/ldiff +0 -27
- data/bin/nokogiri +0 -27
- data/bin/pry +0 -27
- data/bin/pubid-nist +0 -27
- data/bin/racc +0 -27
- data/bin/rackup +0 -29
- data/bin/rake +0 -27
- data/bin/rubocop +0 -27
- data/bin/ruby-parse +0 -27
- data/bin/ruby-rewrite +0 -27
- data/bin/safe_yaml +0 -29
- data/bin/thor +0 -27
- data/lib/relaton_iso/data_fetcher.rb +0 -246
- data/lib/relaton_iso/document_identifier.rb +0 -46
- data/lib/relaton_iso/hash_converter.rb +0 -15
- data/lib/relaton_iso/hit.rb +0 -59
- data/lib/relaton_iso/hit_collection.rb +0 -100
- data/lib/relaton_iso/iso_bibliography.rb +0 -202
- data/lib/relaton_iso/processor.rb +0 -67
- data/lib/relaton_iso/queue.rb +0 -61
- data/lib/relaton_iso/scrapper.rb +0 -553
- data/lib/relaton_iso/util.rb +0 -6
- data/lib/relaton_iso/version.rb +0 -5
- data/lib/relaton_iso.rb +0 -17
| @@ -0,0 +1,11 @@ | |
| 1 | 
            +
            <?xml version="1.0" encoding="UTF-8"?>
         | 
| 2 | 
            +
            <grammar xmlns="http://relaxng.org/ns/structure/1.0">
         | 
| 3 | 
            +
              <include href="basicdoc.rng"/>
         | 
| 4 | 
            +
              <include href="relaton-iso.rng"/>
         | 
| 5 | 
            +
              <start>
         | 
| 6 | 
            +
                <choice>
         | 
| 7 | 
            +
                  <ref name="bibitem"/>
         | 
| 8 | 
            +
                  <ref name="bibdata"/>
         | 
| 9 | 
            +
                </choice>
         | 
| 10 | 
            +
              </start>
         | 
| 11 | 
            +
            </grammar>
         | 
| @@ -0,0 +1,214 @@ | |
| 1 | 
            +
            <?xml version="1.0" encoding="UTF-8"?>
         | 
| 2 | 
            +
            <grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
         | 
| 3 | 
            +
              <include href="biblio-standoc.rng">
         | 
| 4 | 
            +
                <define name="BibDataExtensionType">
         | 
| 5 | 
            +
                  <optional>
         | 
| 6 | 
            +
                    <attribute name="schema-version"/>
         | 
| 7 | 
            +
                  </optional>
         | 
| 8 | 
            +
                  <ref name="doctype"/>
         | 
| 9 | 
            +
                  <optional>
         | 
| 10 | 
            +
                    <ref name="docsubtype"/>
         | 
| 11 | 
            +
                  </optional>
         | 
| 12 | 
            +
                  <ref name="flavor"/>
         | 
| 13 | 
            +
                  <optional>
         | 
| 14 | 
            +
                    <ref name="horizontal"/>
         | 
| 15 | 
            +
                  </optional>
         | 
| 16 | 
            +
                  <ref name="editorialgroup"/>
         | 
| 17 | 
            +
                  <optional>
         | 
| 18 | 
            +
                    <ref name="approvalgroup"/>
         | 
| 19 | 
            +
                  </optional>
         | 
| 20 | 
            +
                  <zeroOrMore>
         | 
| 21 | 
            +
                    <ref name="ics"/>
         | 
| 22 | 
            +
                  </zeroOrMore>
         | 
| 23 | 
            +
                  <ref name="structuredidentifier"/>
         | 
| 24 | 
            +
                  <optional>
         | 
| 25 | 
            +
                    <ref name="stagename"/>
         | 
| 26 | 
            +
                  </optional>
         | 
| 27 | 
            +
                  <optional>
         | 
| 28 | 
            +
                    <ref name="updates_document_type"/>
         | 
| 29 | 
            +
                  </optional>
         | 
| 30 | 
            +
                  <optional>
         | 
| 31 | 
            +
                    <ref name="fast_track"/>
         | 
| 32 | 
            +
                  </optional>
         | 
| 33 | 
            +
                  <optional>
         | 
| 34 | 
            +
                    <ref name="price-code"/>
         | 
| 35 | 
            +
                  </optional>
         | 
| 36 | 
            +
                </define>
         | 
| 37 | 
            +
                <define name="bdate">
         | 
| 38 | 
            +
                  <element name="date">
         | 
| 39 | 
            +
                    <attribute name="type">
         | 
| 40 | 
            +
                      <choice>
         | 
| 41 | 
            +
                        <ref name="BibliographicDateType"/>
         | 
| 42 | 
            +
                        <text/>
         | 
| 43 | 
            +
                      </choice>
         | 
| 44 | 
            +
                    </attribute>
         | 
| 45 | 
            +
                    <choice>
         | 
| 46 | 
            +
                      <group>
         | 
| 47 | 
            +
                        <element name="from">
         | 
| 48 | 
            +
                          <ref name="ISO8601Date"/>
         | 
| 49 | 
            +
                        </element>
         | 
| 50 | 
            +
                        <optional>
         | 
| 51 | 
            +
                          <element name="to">
         | 
| 52 | 
            +
                            <ref name="ISO8601Date"/>
         | 
| 53 | 
            +
                          </element>
         | 
| 54 | 
            +
                        </optional>
         | 
| 55 | 
            +
                      </group>
         | 
| 56 | 
            +
                      <element name="on">
         | 
| 57 | 
            +
                        <choice>
         | 
| 58 | 
            +
                          <ref name="ISO8601Date"/>
         | 
| 59 | 
            +
                          <value>--</value>
         | 
| 60 | 
            +
                          <value>–</value>
         | 
| 61 | 
            +
                        </choice>
         | 
| 62 | 
            +
                      </element>
         | 
| 63 | 
            +
                    </choice>
         | 
| 64 | 
            +
                  </element>
         | 
| 65 | 
            +
                </define>
         | 
| 66 | 
            +
                <define name="DocumentType">
         | 
| 67 | 
            +
                  <choice>
         | 
| 68 | 
            +
                    <value>international-standard</value>
         | 
| 69 | 
            +
                    <value>technical-specification</value>
         | 
| 70 | 
            +
                    <value>technical-report</value>
         | 
| 71 | 
            +
                    <value>publicly-available-specification</value>
         | 
| 72 | 
            +
                    <value>international-workshop-agreement</value>
         | 
| 73 | 
            +
                    <value>guide</value>
         | 
| 74 | 
            +
                    <value>recommendation</value>
         | 
| 75 | 
            +
                    <value>amendment</value>
         | 
| 76 | 
            +
                    <value>technical-corrigendum</value>
         | 
| 77 | 
            +
                    <value>directive</value>
         | 
| 78 | 
            +
                    <value>committee-document</value>
         | 
| 79 | 
            +
                    <value>addendum</value>
         | 
| 80 | 
            +
                  </choice>
         | 
| 81 | 
            +
                </define>
         | 
| 82 | 
            +
                <define name="DocumentSubtype">
         | 
| 83 | 
            +
                  <choice>
         | 
| 84 | 
            +
                    <value>specification</value>
         | 
| 85 | 
            +
                    <value>method-of-test</value>
         | 
| 86 | 
            +
                    <value>vocabulary</value>
         | 
| 87 | 
            +
                    <value>code-of-practice</value>
         | 
| 88 | 
            +
                  </choice>
         | 
| 89 | 
            +
                </define>
         | 
| 90 | 
            +
                <define name="structuredidentifier">
         | 
| 91 | 
            +
                  <element name="structuredidentifier">
         | 
| 92 | 
            +
                    <optional>
         | 
| 93 | 
            +
                      <attribute name="type"/>
         | 
| 94 | 
            +
                    </optional>
         | 
| 95 | 
            +
                    <group>
         | 
| 96 | 
            +
                      <ref name="documentnumber"/>
         | 
| 97 | 
            +
                      <optional>
         | 
| 98 | 
            +
                        <ref name="tc-documentnumber"/>
         | 
| 99 | 
            +
                      </optional>
         | 
| 100 | 
            +
                    </group>
         | 
| 101 | 
            +
                  </element>
         | 
| 102 | 
            +
                </define>
         | 
| 103 | 
            +
                <define name="editorialgroup">
         | 
| 104 | 
            +
                  <element name="editorialgroup">
         | 
| 105 | 
            +
                    <ref name="ISOProjectGroup"/>
         | 
| 106 | 
            +
                  </element>
         | 
| 107 | 
            +
                </define>
         | 
| 108 | 
            +
              </include>
         | 
| 109 | 
            +
              <define name="updates_document_type">
         | 
| 110 | 
            +
                <element name="updates-document-type">
         | 
| 111 | 
            +
                  <ref name="DocumentType"/>
         | 
| 112 | 
            +
                </element>
         | 
| 113 | 
            +
              </define>
         | 
| 114 | 
            +
              <define name="ISOProjectGroup">
         | 
| 115 | 
            +
                <zeroOrMore>
         | 
| 116 | 
            +
                  <ref name="agency"/>
         | 
| 117 | 
            +
                </zeroOrMore>
         | 
| 118 | 
            +
                <oneOrMore>
         | 
| 119 | 
            +
                  <ref name="technical-committee"/>
         | 
| 120 | 
            +
                </oneOrMore>
         | 
| 121 | 
            +
                <zeroOrMore>
         | 
| 122 | 
            +
                  <ref name="subcommittee"/>
         | 
| 123 | 
            +
                </zeroOrMore>
         | 
| 124 | 
            +
                <zeroOrMore>
         | 
| 125 | 
            +
                  <ref name="workgroup"/>
         | 
| 126 | 
            +
                </zeroOrMore>
         | 
| 127 | 
            +
                <optional>
         | 
| 128 | 
            +
                  <ref name="secretariat"/>
         | 
| 129 | 
            +
                </optional>
         | 
| 130 | 
            +
              </define>
         | 
| 131 | 
            +
              <define name="approvalgroup">
         | 
| 132 | 
            +
                <element name="approvalgroup">
         | 
| 133 | 
            +
                  <ref name="ISOProjectGroup"/>
         | 
| 134 | 
            +
                </element>
         | 
| 135 | 
            +
              </define>
         | 
| 136 | 
            +
              <define name="agency">
         | 
| 137 | 
            +
                <element name="agency">
         | 
| 138 | 
            +
                  <text/>
         | 
| 139 | 
            +
                </element>
         | 
| 140 | 
            +
              </define>
         | 
| 141 | 
            +
              <define name="horizontal">
         | 
| 142 | 
            +
                <element name="horizontal">
         | 
| 143 | 
            +
                  <data type="boolean"/>
         | 
| 144 | 
            +
                </element>
         | 
| 145 | 
            +
              </define>
         | 
| 146 | 
            +
              <define name="documentnumber">
         | 
| 147 | 
            +
                <element name="project-number">
         | 
| 148 | 
            +
                  <optional>
         | 
| 149 | 
            +
                    <attribute name="part">
         | 
| 150 | 
            +
                      <data type="int"/>
         | 
| 151 | 
            +
                    </attribute>
         | 
| 152 | 
            +
                  </optional>
         | 
| 153 | 
            +
                  <optional>
         | 
| 154 | 
            +
                    <attribute name="subpart">
         | 
| 155 | 
            +
                      <data type="int"/>
         | 
| 156 | 
            +
                    </attribute>
         | 
| 157 | 
            +
                  </optional>
         | 
| 158 | 
            +
                  <optional>
         | 
| 159 | 
            +
                    <attribute name="amendment">
         | 
| 160 | 
            +
                      <data type="int"/>
         | 
| 161 | 
            +
                    </attribute>
         | 
| 162 | 
            +
                  </optional>
         | 
| 163 | 
            +
                  <optional>
         | 
| 164 | 
            +
                    <attribute name="corrigendum">
         | 
| 165 | 
            +
                      <data type="int"/>
         | 
| 166 | 
            +
                    </attribute>
         | 
| 167 | 
            +
                  </optional>
         | 
| 168 | 
            +
                  <optional>
         | 
| 169 | 
            +
                    <attribute name="origyr">
         | 
| 170 | 
            +
                      <ref name="ISO8601Date"/>
         | 
| 171 | 
            +
                    </attribute>
         | 
| 172 | 
            +
                  </optional>
         | 
| 173 | 
            +
                  <text/>
         | 
| 174 | 
            +
                </element>
         | 
| 175 | 
            +
              </define>
         | 
| 176 | 
            +
              <define name="tc-documentnumber">
         | 
| 177 | 
            +
                <element name="tc-document-number">
         | 
| 178 | 
            +
                  <data type="int"/>
         | 
| 179 | 
            +
                </element>
         | 
| 180 | 
            +
              </define>
         | 
| 181 | 
            +
              <define name="subcommittee">
         | 
| 182 | 
            +
                <element name="subcommittee">
         | 
| 183 | 
            +
                  <ref name="IsoWorkgroup"/>
         | 
| 184 | 
            +
                </element>
         | 
| 185 | 
            +
              </define>
         | 
| 186 | 
            +
              <define name="workgroup">
         | 
| 187 | 
            +
                <element name="workgroup">
         | 
| 188 | 
            +
                  <ref name="IsoWorkgroup"/>
         | 
| 189 | 
            +
                </element>
         | 
| 190 | 
            +
              </define>
         | 
| 191 | 
            +
              <define name="secretariat">
         | 
| 192 | 
            +
                <element name="secretariat">
         | 
| 193 | 
            +
                  <text/>
         | 
| 194 | 
            +
                </element>
         | 
| 195 | 
            +
              </define>
         | 
| 196 | 
            +
              <define name="stagename">
         | 
| 197 | 
            +
                <element name="stagename">
         | 
| 198 | 
            +
                  <optional>
         | 
| 199 | 
            +
                    <attribute name="abbreviation"/>
         | 
| 200 | 
            +
                  </optional>
         | 
| 201 | 
            +
                  <text/>
         | 
| 202 | 
            +
                </element>
         | 
| 203 | 
            +
              </define>
         | 
| 204 | 
            +
              <define name="fast_track">
         | 
| 205 | 
            +
                <element name="fast-track">
         | 
| 206 | 
            +
                  <data type="boolean"/>
         | 
| 207 | 
            +
                </element>
         | 
| 208 | 
            +
              </define>
         | 
| 209 | 
            +
              <define name="price-code">
         | 
| 210 | 
            +
                <element name="price-code">
         | 
| 211 | 
            +
                  <text/>
         | 
| 212 | 
            +
                </element>
         | 
| 213 | 
            +
              </define>
         | 
| 214 | 
            +
            </grammar>
         | 
| @@ -0,0 +1,206 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # require 'relaton_iso/iso_bibliographic_item'
         | 
| 4 | 
            +
            # require "relaton_iso/scrapper"
         | 
| 5 | 
            +
            # require "relaton_iso/hit_collection"
         | 
| 6 | 
            +
            # require "relaton_iec"
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            module Relaton
         | 
| 9 | 
            +
              module Iso
         | 
| 10 | 
            +
                # Methods for search ISO standards.
         | 
| 11 | 
            +
                module Bibliography
         | 
| 12 | 
            +
                  extend self
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                  # Look up hits for an ISO identifier.
                  #
                  # @param pubid [Pubid::Iso::Identifier, String] identifier to search for;
                  #   a String is first parsed with Pubid::Iso::Identifier.parse
                  # @param opts [Hash] options handed to HitCollection.new unchanged
                  # @return [Object] the result of HitCollection#find
                  # @raise [Relaton::RequestError] when one of the network-level errors
                  #   listed in the rescue clause is raised; its message is carried over
                  def search(pubid, opts = {})
                    identifier = pubid.is_a?(String) ? ::Pubid::Iso::Identifier.parse(pubid) : pubid
                    HitCollection.new(identifier, opts).find
                  rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
                         EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
                         Net::ProtocolError, OpenSSL::SSL::SSLError, Errno::ETIMEDOUT => error
                    raise Relaton::RequestError, error.message
                  end
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                  # Fetch the bibliographic item for an ISO reference.
                  #
                  # @param ref [String] the ISO standard code to look up (e.g. "ISO 9000")
                  # @param year [String, NilClass] the year the standard was published
                  # @param opts [Hash] options (see below)
                  # @option opts [Boolean] :all_parts if all-parts reference is required
                  # @option opts [Boolean] :keep_year if undated reference should return
                  #   actual reference with year
                  #
                  # @return [Object, nil] the document returned by `hits.fetch_doc`, or its
                  #   `to_most_recent_reference`; nil when nothing is found or when the
                  #   reference cannot be parsed.
                  #   NOTE(review): earlier docs named RelatonIsoBib::IsoBibliographicItem
                  #   here -- confirm the class used after the namespace change.
                  def get(ref, year = nil, opts = {}) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/MethodLength,Metrics/PerceivedComplexity,Metrics/AbcSize
                    # Normalize en dashes (U+2013) to ASCII hyphens before parsing.
                    code = ref.gsub("\u2013", "-")

                    # parse "all parts" request
                    # code.sub! " (all parts)", ""
                    # opts[:all_parts] ||= $~ && opts[:all_parts].nil?

                    query_pubid = ::Pubid::Iso::Identifier.parse(code)
                    # The safe navigation matters: nil itself responds to :to_i, so a plain
                    # `year.respond_to?` would assign 0 when no year is given.
                    query_pubid.root.year = year.to_i if year&.respond_to?(:to_i)
                    query_pubid.root.all_parts ||= opts[:all_parts]
                    Util.info "Fetching from Relaton repository ...", key: query_pubid.to_s

                    hits, missed_year_ids = isobib_search_filter(query_pubid, opts)
                    # Note: the fallback lookup receives the raw `ref`, not the normalized `code`.
                    tip_ids = look_up_with_any_types_stages(hits, ref, opts)
                    ret = hits.fetch_doc
                    return fetch_ref_err(query_pubid, missed_year_ids, tip_ids) unless ret

                    response_pubid = ret.docidentifier.find(&:primary) # .sub(" (all parts)", "")
                    Util.info "Found: `#{response_pubid}`", key: query_pubid.to_s
                    # Return the fetched item as-is when the query carried a year and
                    # :keep_year was not given, or when :keep_year / :all_parts is truthy.
                    get_all = (query_pubid.root.year && opts[:keep_year].nil?) || opts[:keep_year] || opts[:all_parts]
                    return ret if get_all

                    ret.to_most_recent_reference
                  rescue ::Pubid::Core::Errors::ParseError
                    # Unparsable references are reported and answered with nil, not raised.
                    Util.warn "Is not recognized as a standards identifier.", key: code
                    nil
                  end
         | 
| 60 | 
            +
             | 
| 61 | 
            +
                  # Decide whether a candidate identifier is acceptable with respect to
                  # its part number.
                  #
                  # @param query_pubid [Pubid::Iso::Identifier] identifier being searched for
                  # @param pubid [Pubid::Iso::Identifier] candidate identifier
                  # @param all_parts [Boolean] when true, accept any candidate that has a
                  #   part number, ignoring the part of the query
                  # @return [Boolean]
                  def matches_parts?(query_pubid, pubid, all_parts: false)
                    if all_parts
                      # only documents that carry a part number qualify
                      !pubid.part.nil?
                    else
                      query_pubid.part == pubid.part
                    end
                  end
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                  #
                  # Matches base of query_pubid and pubid: publisher, number and
                  # copublisher must be equal. Unless any_types_stages is set, the stage
                  # must also be equal and query_pubid must be an instance of pubid's class.
                  #
                  # @param [Pubid::Iso::Identifier] query_pubid pubid to match
                  # @param [Pubid::Iso::Identifier] pubid pubid to match against
                  # @param [Boolean] any_types_stages match with any types and stages
                  #
                  # @return [Boolean] false when pubid does not respond to :publisher
                  #
                  def matches_base?(query_pubid, pubid, any_types_stages: false) # rubocop:disable Metrics/PerceivedComplexity
                    return false unless pubid.respond_to?(:publisher)

                    query_pubid.publisher == pubid.publisher &&
                      query_pubid.number == pubid.number &&
                      query_pubid.copublisher == pubid.copublisher &&
                      (any_types_stages ||
                        (query_pubid.stage == pubid.stage && query_pubid.is_a?(pubid.class)))
                  end
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                  # Keep only the hits whose year matches +year+ (as decided by
                  # check_year). The collection is filtered in place with select!, and
                  # each hit's pubid.year is filled from hit.hit[:year] when unset.
                  #
                  # @param hit_collection [#select!] hits to filter (mutated)
                  # @param year [String, Integer, nil] wanted year; nil disables filtering
                  # @return [Array] two elements: the same hit_collection, and a Set of
                  #   pubid strings for rejected hits that do have a year
                  def filter_hits_by_year(hit_collection, year)
                    missed_year_ids = Set.new

                    unless year.nil?
                      hit_collection.select! do |hit|
                        hit.pubid.year ||= hit.hit[:year]
                        year_matches = check_year(year, hit)
                        # remember dated hits that were rejected, for the "TIP" message
                        missed_year_ids << hit.pubid.to_s if !year_matches && hit.pubid.year
                        year_matches
                      end
                    end

                    [hit_collection, missed_year_ids]
                  end
         | 
| 109 | 
            +
             | 
| 110 | 
            +
                  private
         | 
| 111 | 
            +
             | 
| 112 | 
            +
                  # Tell whether a hit belongs to the requested year. The hit matches when
                  # its pubid's own year equals +year+, or when the pubid has a base
                  # identifier whose year equals +year+. Years are compared as strings, so
                  # 2015 and "2015" are equal.
                  #
                  # @param year [String, Integer] wanted year
                  # @param hit [#pubid] hit whose pubid responds to #year and #base
                  # @return [Boolean]
                  def check_year(year, hit)
                    wanted = year.to_s
                    pubid = hit.pubid
                    return true if pubid.year.to_s == wanted

                    !pubid.base.nil? && pubid.base.year.to_s == wanted
                  end
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                  # Log that nothing was found for +pubid+, followed by hints, and
                  # return nil. All messages go through Util.info keyed by pubid.to_s.
                  #
                  # @param pubid [Pubid::Iso::Identifier] PubID with no results
                  # @param missed_year_ids [#any?, #map] IDs that matched except for the year
                  # @param tip_ids [#any?, #map] IDs found when types and stages were ignored
                  # @return [nil]
                  def fetch_ref_err(pubid, missed_year_ids, tip_ids) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
                    log_key = pubid.to_s
                    backticked = ->(list) { list.map { |entry| "`#{entry}`" }.join(", ") }

                    Util.info "Not found.", key: log_key

                    if missed_year_ids.any?
                      Util.info "TIP: No match for edition year #{pubid.year}, but matches exist for #{backticked.call(missed_year_ids)}.", key: log_key
                    end

                    Util.info "TIP: Matches exist for #{backticked.call(tip_ids)}.", key: log_key if tip_ids.any?

                    # A part-specific query gets a different closing hint than a whole-document one.
                    closing_tip =
                      if pubid.part
                        "TIP: If it cannot be found, the document may no longer be published in parts."
                      else
                        "TIP: If you wish to cite all document parts for the reference, " \
                          "use `#{pubid.to_s(format: :ref_undated)} (all parts)`."
                      end
                    Util.info closing_tip, key: log_key

                    nil
                  end
         | 
| 141 | 
            +
             | 
| 142 | 
            +
                  # Fallback lookup used to build "TIP" suggestions. When the regular
                  # search found nothing and +ref+ starts with "ISO", a "/" or whitespace,
                  # and an uppercase letter, the run of uppercase letters after "ISO" is
                  # dropped and the search is repeated with any_types_stages: true.
                  #
                  # @param hits [#any?] hits of the regular search
                  # @param ref [String] reference as given by the caller
                  # @param opts [Hash] options passed on to isobib_search_filter
                  # @return [Array] pubids of the fallback hits; empty when +hits+ is not
                  #   empty or +ref+ does not have the expected prefix
                  def look_up_with_any_types_stages(hits, ref, opts)
                    # \A anchors at the start of the string; ^ would also match after a newline.
                    return [] if hits.any? || !ref.match?(%r{\AISO[/\s][A-Z]})

                    ref_no_type_stage = ref.sub(%r{\AISO[/\s][A-Z]+}, "ISO")
                    pubid = ::Pubid::Iso::Identifier.parse(ref_no_type_stage)
                    resp, = isobib_search_filter(pubid, opts, any_types_stages: true)
                    resp.map(&:pubid)
                  end
         | 
| 150 | 
            +
             | 
| 151 | 
            +
                  #
                  # Search for the identifier, then keep only the hits that match it.
                  #
                  # @param query_pubid [Pubid::Iso::Identifier] reference without correction
                  # @param opts [Hash] options passed to `search`
                  # @param any_types_stages [Boolean] match with any types and stages
                  #
                  # @return [Array] result of `filter_hits`; callers destructure it and use
                  #   the first element as the hits (the rest is presumably the missed
                  #   year IDs produced by `filter_hits_by_year` - confirm there)
                  #
                  def isobib_search_filter(query_pubid, opts, any_types_stages: false)
                    filter_hits(search(query_pubid, opts), query_pubid, any_types_stages)
                  end
         | 
| 166 | 
            +
             | 
| 167 | 
            +
                  #
                  # Filter hits by query_pubid.
                  #
                  # Hits are removed from +hit_collection+ in place (`select!`). A hit is
                  # kept when its pubid matches the reference (see `pubid_match?`) and,
                  # if the query asks for all parts, the hit's pubid has a part.
                  # The remaining hits are then passed to `filter_hits_by_year`.
                  #
                  # @param hit_collection [Relaton::Iso::HitCollection]
                  # @param query_pubid [Pubid::Iso::Identifier]
                  # @param any_types_stages [Boolean]
                  #
                  # @return [Object] whatever `filter_hits_by_year` returns
                  #
                  def filter_hits(hit_collection, query_pubid, any_types_stages) # rubocop:disable Metrics/AbcSize
                    all_parts = query_pubid.root.all_parts
                    ignored = build_excludings(all_parts, any_types_stages)
                    reference = hit_collection.ref_pubid_no_year.exclude(*ignored)
                    hit_collection.select! do |hit|
                      next false unless pubid_match?(hit.pubid, query_pubid, ignored, reference)

                      # An "all parts" query only accepts hits that carry a part.
                      !all_parts || !hit.pubid.part.nil?
                    end

                    filter_hits_by_year(hit_collection, query_pubid.root.year)
                  end
         | 
| 188 | 
            +
             | 
| 189 | 
            +
                  #
                  # Build the list of pubid attributes to leave out when comparing pubids.
                  #
                  # @param all_parts [Boolean] when truthy, `:part` is left out as well
                  # @param any_types_stages [Boolean] when truthy, `:type`, `:stage` and
                  #   `:iteration` are left out as well
                  #
                  # @return [Array<Symbol>] attribute names (a new array on every call)
                  #
                  def build_excludings(all_parts, any_types_stages)
                    ignored = %i[year edition all_parts]
                    ignored.concat(%i[type stage iteration]) if any_types_stages
                    ignored.push(:part) if all_parts
                    ignored
                  end
         | 
| 195 | 
            +
             | 
| 196 | 
            +
                  #
                  # Check whether a hit's pubid matches the reference.
                  #
                  # A String pubid is compared with the string form of +query_pubid+.
                  # Any other pubid is duplicated, its base (if any) is stripped of year
                  # and edition, the +excludings+ are removed and the result is compared
                  # with +no_year_ref+. The pubid passed in is not reassigned.
                  #
                  # @param pubid [String, Pubid::Iso::Identifier] pubid of a hit
                  # @param query_pubid [Pubid::Iso::Identifier] identifier being searched
                  # @param excludings [Array<Symbol>] attributes to ignore
                  # @param no_year_ref [Pubid::Iso::Identifier] reference with the
                  #   excludings already removed
                  #
                  # @return [Boolean]
                  #
                  def pubid_match?(pubid, query_pubid, excludings, no_year_ref)
                    return pubid == query_pubid.to_s if pubid.is_a?(String)

                    candidate = pubid.dup
                    base = candidate.base
                    candidate.base = base.exclude(:year, :edition) if base
                    candidate.exclude(*excludings) == no_year_ref
                  end
         | 
| 204 | 
            +
                end
         | 
| 205 | 
            +
              end
         | 
| 206 | 
            +
            end
         | 
| @@ -0,0 +1,227 @@ | |
| 1 | 
            +
            require_relative "../iso"
         | 
| 2 | 
            +
            require_relative "queue"
         | 
| 3 | 
            +
            require_relative "scraper"
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            module Relaton
         | 
| 6 | 
            +
              module Iso
         | 
| 7 | 
            +
                # Fetch all the documents from ISO website.
         | 
| 8 | 
            +
                class DataFetcher < Core::DataFetcher
         | 
| 9 | 
            +
                  #
                  # Two strings describing where fetch errors are reported.
                  # NOTE(review): presumably the GitHub repository and the issue title
                  # used by Core::DataFetcher - confirm against the parent class.
                  #
                  # @return [Array<String>]
                  #
                  def gh_issue_channel
                    repository = "relaton/relaton-iso"
                    title = "Error fetching ISO documents"
                    [repository, title]
                  end
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                  #
                  # Work queue shared with the worker threads: it holds the page paths
                  # being fetched in the current run. Created on first use.
                  #
                  # The leading `::` selects Ruby's core Queue rather than
                  # Relaton::Iso::Queue, which is used by `iso_queue`.
                  #
                  # @return [::Queue] queue
                  #
                  def queue
                    @queue = ::Queue.new unless @queue
                    @queue
                  end
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                  #
                  # Lock used to serialize saving of documents (see `fetch_doc`).
                  # Created on first use.
                  #
                  # @return [Mutex]
                  #
                  def mutex
                    return @mutex if @mutex

                    @mutex = Mutex.new
                  end
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                  #
                  # Log an error message through `Util.error`.
                  #
                  # @param msg [String] message to log
                  #
                  # @return [Object] whatever `Util.error` returns
                  #
                  def log_error(msg)
                    Util.error(msg)
                  end
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                  #
                  # Index of fetched documents, found or created on first use. The index
                  # file name is `HitCollection::INDEXFILE` with a ".yaml" extension.
                  #
                  # @return [Object] result of `Relaton::Index.find_or_create`
                  #
                  def index
                    return @index if @index

                    index_file = "#{HitCollection::INDEXFILE}.yaml"
                    @index = Relaton::Index.find_or_create(:iso, file: index_file)
                  end
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                  #
                  # Queue of document paths that still have to be fetched.
                  #
                  # ISO has too many documents for GHA to fetch in a single run, so the
                  # work is split over several runs and this queue keeps the paths that
                  # have not been fetched yet. Created on first use.
                  #
                  # @return [Relaton::Iso::Queue] queue
                  #
                  def iso_queue
                    @iso_queue = Relaton::Iso::Queue.new unless @iso_queue
                    @iso_queue
                  end
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                  #
                  # Entry point: crawl all ICS pages, fetch the queued documents, then
                  # persist the document queue and the index and report errors.
                  #
                  # @return [void]
                  #
                  def fetch # rubocop:disable Metrics/AbcSize
                    Util.info("Scrapping ICS pages...")
                    fetch_ics
                    Util.info("(#{Time.now}) Scrapping documents...")
                    fetch_docs
                    iso_queue.save
                    # index.sort! { |a, b| compare_docids a, b }
                    index.save
                    # NOTE(review): `repot_errors` is not defined in this class; it looks
                    # like a typo for `report_errors` - confirm the name the parent
                    # class (Core::DataFetcher) actually provides.
                    repot_errors
                  end
         | 
| 60 | 
            +
             | 
| 61 | 
            +
                  private
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                  #
                  # Crawl the ICS catalogue. The root ICS page is fetched by the calling
                  # thread; every ICS page path pushed to `queue` is then fetched by one of
                  # three worker threads (see `fetch_ics_page`), which may push more paths.
                  #
                  # @return [void]
                  #
                  def fetch_ics
                    workers = Array.new(3) { thread { |path| fetch_ics_page(path) } }
                    fetch_ics_page "/standards-catalogue/browse-by-ics.html"
                    # An empty queue alone does not mean the crawl is finished: a worker
                    # may still be parsing a page that is about to enqueue more paths.
                    # Paths enqueued after the :END markers would never be crawled and
                    # would be left in the queue. So wait until the queue is empty AND
                    # every living worker is blocked in `queue.pop`.
                    loop do
                      alive = workers.count(&:alive?)
                      # If all workers died, stop waiting; `join` below re-raises.
                      break if alive.zero? || (queue.empty? && queue.num_waiting >= alive)

                      sleep 1
                    end
                    workers.size.times { queue << :END }
                    workers.each(&:join)
                  end
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                  #
                  # Fetch one ICS page and collect the document links and the nested ICS
                  # links found on it. When the page cannot be fetched an error is
                  # logged and nothing is parsed.
                  #
                  # @param path [String] path to the ICS page
                  #
                  # @return [void]
                  #
                  def fetch_ics_page(path)
                    response = get_redirection(path)
                    if response
                      html = Nokogiri::HTML(response.body)
                      parse_doc_links(html)
                      parse_ics_links(html)
                    else
                      Util.error "Failed fetching ICS page #{url(path)}"
                      nil
                    end
                  end
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                  #
                  # Put every document link found on the page at the front of
                  # `iso_queue`, with the query string removed from the link.
                  #
                  # `@errors[:doc_links]` is only overwritten while it is truthy, so it
                  # stays truthy only as long as no page contained any document link.
                  # NOTE(review): assumes `@errors` yields a truthy value for keys not
                  # set yet - confirm where `@errors` is initialized.
                  #
                  # @param page [#xpath] parsed ICS page
                  #
                  # @return [void]
                  #
                  def parse_doc_links(page)
                    anchors = page.xpath("//td[@data-title='Standard and/or project']/div/div/a")
                    @errors[:doc_links] = anchors.empty? if @errors[:doc_links]
                    anchors.each do |anchor|
                      href_without_query = anchor[:href].split("?").first
                      iso_queue.add_first(href_without_query)
                    end
                  end
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                  #
                  # Push every nested ICS page link found on the page to `queue`, where
                  # the worker threads pick it up.
                  #
                  # `@errors[:ics_links]` is only overwritten while it is truthy, so it
                  # stays truthy only as long as no page contained any ICS link.
                  # NOTE(review): assumes `@errors` yields a truthy value for keys not
                  # set yet - confirm where `@errors` is initialized.
                  #
                  # @param page [#xpath] parsed ICS page
                  #
                  # @return [void]
                  #
                  def parse_ics_links(page)
                    anchors = page.xpath("//td[@data-title='ICS']/a")
                    @errors[:ics_links] = anchors.empty? if @errors[:ics_links]
                    anchors.each { |anchor| queue.push(anchor[:href]) }
                  end
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                  #
                  # Build the full URL of a page by prefixing `Scraper::DOMAIN`.
                  #
                  # @param path [String] page path
                  #
                  # @return [String] URL
                  #
                  def url(path)
                    domain = Scraper::DOMAIN
                    domain + path
                  end
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                  #
                  # Get the page from the given path. If the page is redirected, get the
                  # page from the new path (handled by `get_response`).
                  #
                  # Open/read timeouts and refused connections are retried for as long
                  # as `check_try` allows; after that a warning is logged.
                  #
                  # @param [String] path path to the page
                  #
                  # @return [Net::HTTPResponse, nil] HTTP response, or nil when every
                  #   attempt failed
                  #
                  def get_redirection(path) # rubocop:disable Metrics/MethodLength
                    try = 0
                    uri = URI url(path)
                    begin
                      get_response uri
                    rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNREFUSED => e
                      try += 1
                      retry if check_try try, uri

                      Util.warn "Failed fetching #{uri}, #{e.message}"
                      # Return nil explicitly: the logger's return value must not be
                      # mistaken for a response by callers that test the result.
                      nil
                    end
                  end
         | 
| 124 | 
            +
             | 
| 125 | 
            +
                  #
                  # Perform a GET request. A response with code "302" is followed by
                  # fetching the path given in its "location" header; any other
                  # response is returned as is.
                  #
                  # @param uri [URI] address to request
                  #
                  # @return [Net::HTTPResponse, nil] response (nil when following the
                  #   redirection failed)
                  #
                  def get_response(uri)
                    response = Net::HTTP.get_response(uri)
                    return response unless response.code == "302"

                    get_redirection(response["location"])
                  end
         | 
| 129 | 
            +
             | 
| 130 | 
            +
                  #
                  # Decide whether a failed request should be retried. While fewer than
                  # three attempts were made, a warning is logged, the thread pauses
                  # for a second and true is returned.
                  #
                  # @param try [Integer] number of failed attempts so far
                  # @param uri [URI] address being fetched (used in the log message)
                  #
                  # @return [true, nil] true to retry, nil to give up
                  #
                  def check_try(try, uri)
                    return unless try < 3

                    Util.warn "Timeout fetching #{uri}, retrying..."
                    sleep 1
                    true
                  end
         | 
| 137 | 
            +
             | 
| 138 | 
            +
                  #
                  # Fetch documents with three worker threads. Only the entries at
                  # positions 0..10_000 of `iso_queue` are handed to the workers in one
                  # run; each worker stops when it receives an :END marker.
                  #
                  # @return [Array<Thread>] the finished worker threads
                  #
                  def fetch_docs
                    workers = Array.new(3) { thread { |path| fetch_doc(path) } }
                    batch = iso_queue[0..10_000]
                    batch.each { |docpath| queue.push(docpath) }
                    workers.each { queue.push(:END) }
                    workers.each(&:join)
                  end
         | 
| 144 | 
            +
             | 
| 145 | 
            +
                  #
                  # Fetch document from ISO website and save it.
                  #
                  # The page is parsed outside the lock; saving is done while holding
                  # `mutex`. Any StandardError is logged as a warning and not re-raised.
                  #
                  # @param [String] docpath document page path
                  #
                  # @return [void]
                  #
                  def fetch_doc(docpath)
                    parsed = Scraper.parse_page(docpath, errors: @errors)
                    mutex.synchronize { save_doc(parsed, docpath) }
                  rescue StandardError => e
                    Util.warn "Fail fetching document: #{url(docpath)}\n#{e.message}\n#{e.backtrace}"
                  end
         | 
| 158 | 
            +
             | 
| 159 | 
            +
                  # def compare_docids(id1, id2)
         | 
| 160 | 
            +
                  #   Pubid::Iso::Identifier.create(**id1).to_s <=> Pubid::Iso::Identifier.create(**id2).to_s
         | 
| 161 | 
            +
                  # end
         | 
| 162 | 
            +
             | 
| 163 | 
            +
                  #
                  # Save document to file and move its path to the end of `iso_queue`.
                  #
                  # The file name is derived from the primary document identifier. If
                  # the file already exists, `rewrite_with_same_or_newer` decides whether
                  # it is overwritten; otherwise the file is written.
                  #
                  # @param doc [#docidentifier] document
                  # @param docpath [String] document page path
                  #
                  # @return [void]
                  #
                  def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
                    primary_id = doc.docidentifier.find(&:primary)
                    path = output_file(primary_id.content.to_s)
                    if File.exist?(path)
                      rewrite_with_same_or_newer(doc, primary_id, path, docpath)
                    else
                      write_file(path, doc, primary_id)
                    end
                    iso_queue.move_last(docpath)
                  end
         | 
| 180 | 
            +
             | 
| 181 | 
            +
                  #
                  # Decide what to do with a document whose file already exists.
                  #
                  # The file is overwritten when the new document has a greater edition
                  # or when `replace_substage98?` allows it. Otherwise, if the file was
                  # already written during this run and the stored edition is not
                  # greater than the new one, a duplicate warning is logged.
                  #
                  # @param doc [Object] freshly fetched document
                  # @param docid [#content] primary document identifier
                  # @param file [String] path of the existing file
                  # @param docpath [String] document page path (for the log message)
                  #
                  # @return [void]
                  #
                  def rewrite_with_same_or_newer(doc, docid, file, docpath)
                    stored = Item.from_yaml(File.read(file, encoding: "UTF-8"))
                    return write_file(file, doc, docid) if edition_greater?(doc, stored) || replace_substage98?(doc, stored)
                    return unless @files.include?(file)
                    return if edition_greater?(stored, doc)

                    Util.warn "Duplicate file `#{file}` for `#{docid.content}` from #{url(docpath)}"
                  end
         | 
| 189 | 
            +
             | 
| 190 | 
            +
                  #
                  # Tell whether the edition of +doc+ is numerically greater than the
                  # edition of +bib+. Edition contents are compared as integers (`to_i`).
                  #
                  # @param doc [#edition] first document
                  # @param bib [#edition] second document
                  #
                  # @return [Boolean, nil] falsy when either document has no edition
                  #
                  def edition_greater?(doc, bib)
                    left = doc.edition
                    right = bib.edition
                    left && right && left.content.to_i > right.content.to_i
                  end
         | 
| 193 | 
            +
             | 
| 194 | 
            +
                  #
                  # Tell whether a stored document of the same edition may be replaced.
                  #
                  # True when both editions have equal content and either the new
                  # document's substage is not "98" or the stored document's substage
                  # is "98" as well. Missing editions/statuses are treated as nil.
                  #
                  # @param doc [#edition, #status] freshly fetched document
                  # @param bib [#edition, #status] stored document
                  #
                  # @return [Boolean]
                  #
                  def replace_substage98?(doc, bib) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
                    return false unless doc.edition&.content == bib.edition&.content

                    new_substage = doc.status&.substage&.content
                    return true if new_substage != "98"

                    bib.status&.substage&.content == "98"
                  end
         | 
| 198 | 
            +
             | 
| 199 | 
            +
                  #
                  # Write the serialized document to the file, remember the file as
                  # written in this run and register it in the index.
                  #
                  # @param file [String] output file path
                  # @param doc [Object] document to serialize
                  # @param docid [#content] primary document identifier
                  #
                  # @return [Integer] result of `File.write`
                  #
                  def write_file(file, doc, docid)
                    @files << file
                    index_key = docid.content.to_h
                    index.add_or_update(index_key, file)
                    File.write(file, serialize(doc), encoding: "UTF-8")
                  end
         | 
| 204 | 
            +
             | 
| 205 | 
            +
                  #
                  # Serialize the document to YAML via `Item.to_yaml`.
                  #
                  # @param doc [Object] document
                  #
                  # @return [Object] result of `Item.to_yaml` (presumably a YAML String)
                  #
                  def to_yaml(doc)
                    Item.to_yaml(doc)
                  end
         | 
| 208 | 
            +
             | 
| 209 | 
            +
                  #
                  # Serialize the document to XML via `Bibdata.to_xml`.
                  #
                  # @param doc [Object] document
                  #
                  # @return [Object] result of `Bibdata.to_xml` (presumably an XML String)
                  #
                  def to_xml(doc)
                    Bibdata.to_xml(doc)
                  end
         | 
| 212 | 
            +
             | 
| 213 | 
            +
                  #
                  # Create thread worker.
                  #
                  # The worker pops items from `queue` and yields each of them to the
                  # given block until it pops the :END marker, then it finishes.
                  #
                  # @yieldparam path [Object] item popped from the queue
                  #
                  # @return [Thread] thread
                  #
                  def thread
                    Thread.new do
                      until (path = queue.pop) == :END
                        yield path
                      end
                    end
                  end
         | 
| 225 | 
            +
                end
         | 
| 226 | 
            +
              end
         | 
| 227 | 
            +
            end
         |