bipm-data-importer 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/exe/bipm-fetch +84 -20
- data/lib/bipm/data/importer/asciimath.rb +3 -3
- data/lib/bipm/data/importer/common.rb +1 -1
- data/lib/bipm/data/importer/version.rb +1 -1
- metadata +2 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 738de7ebf04b508b4a769577c1c7f82eb81e48f1c401e0e7895197cbb33b65c0
         | 
| 4 | 
            +
              data.tar.gz: 37d6c10ed703778e8dd4c968ac2a3e174cc6cc937f6ab0bffb1ede6a3448d0c2
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: '06043389fc63d5038a836342accce06736f9fcb68e61aa309ce980ae439f69d66dd79ffec35003c2d6ea121a5d727bb06cb453a4860d1760adedaf52db9c98f7'
         | 
| 7 | 
            +
              data.tar.gz: 35e740c67821496b97012c251291bf755b19be2983f43f12129cc94b7ef3494fe8d51858269942ec275cb9e28bdec3f6f4d92049dc0c3bfd06e44ee6ed91f8e5
         | 
    
        data/exe/bipm-fetch
    CHANGED
    
    | @@ -29,22 +29,6 @@ bodies.each do |bodyid, bodyurl| | |
| 29 29 |  | 
| 30 30 | 
             
              body = bodyid.to_s.downcase.gsub(" ", "-").to_sym
         | 
| 31 31 |  | 
| 32 | 
            -
              meetings_en = VCR.use_cassette "#{body}/#{body}-meetings" do
         | 
| 33 | 
            -
                a.get "#{bodyurl}/meetings"
         | 
| 34 | 
            -
              end
         | 
| 35 | 
            -
              
         | 
| 36 | 
            -
              meetings_fr = VCR.use_cassette "#{body}/#{body}-meetings-fr" do
         | 
| 37 | 
            -
                a.get "#{bodyurl.gsub("/en/", "/fr/")}/meetings"
         | 
| 38 | 
            -
              end
         | 
| 39 | 
            -
             | 
| 40 | 
            -
              publications_en = VCR.use_cassette "#{body}/#{body}-publications" do
         | 
| 41 | 
            -
                a.get "#{bodyurl}/publications"
         | 
| 42 | 
            -
              end
         | 
| 43 | 
            -
             | 
| 44 | 
            -
              publications_fr = VCR.use_cassette "#{body}/#{body}-publications-fr" do
         | 
| 45 | 
            -
                a.get "#{bodyurl.gsub("/en/", "/fr/")}/publications"
         | 
| 46 | 
            -
              end
         | 
| 47 | 
            -
             | 
| 48 32 | 
             
              resolutions = {}
         | 
| 49 33 | 
             
              %w[en fr].each do |meeting_lang|
         | 
| 50 34 | 
             
                next if ARGV[0] == '--fork' && fork
         | 
| @@ -52,8 +36,45 @@ bodies.each do |bodyid, bodyurl| | |
| 52 36 | 
             
                meeting_lang_sfx     = (meeting_lang == 'fr') ? "-fr" : ""
         | 
| 53 37 | 
             
                meeting_lang_sfx_dir = (meeting_lang == 'fr') ? "-fr" : "-en"
         | 
| 54 38 |  | 
| 55 | 
            -
                 | 
| 56 | 
            -
             | 
| 39 | 
            +
                bodyurl_local = meeting_lang == "en" ? bodyurl : bodyurl.gsub("/en/", "/fr/")
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                cassfx = meeting_lang == "en" ? "" : "-fr"
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                pages = {}
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                pages[:index] = VCR.use_cassette "#{body}/#{body}-index#{meeting_lang_sfx}" do
         | 
| 46 | 
            +
                  a.get "#{bodyurl_local}"
         | 
| 47 | 
            +
                end
         | 
| 48 | 
            +
             | 
| 49 | 
            +
                pages[:meetings] = VCR.use_cassette "#{body}/#{body}-meetings#{meeting_lang_sfx}" do
         | 
| 50 | 
            +
                  a.get "#{bodyurl_local}/meetings"
         | 
| 51 | 
            +
                end
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                pages[:publications] = VCR.use_cassette "#{body}/#{body}-publications#{meeting_lang_sfx}" do
         | 
| 54 | 
            +
                  a.get "#{bodyurl_local}/publications"
         | 
| 55 | 
            +
                end
         | 
| 56 | 
            +
             | 
| 57 | 
            +
                # CIPM
         | 
| 58 | 
            +
                pages[:recommendations] = VCR.use_cassette "#{body}/#{body}-recommendations#{meeting_lang_sfx}" do
         | 
| 59 | 
            +
                  a.get "#{bodyurl_local}/recommendations"
         | 
| 60 | 
            +
                rescue Mechanize::ResponseCodeError
         | 
| 61 | 
            +
                  nil
         | 
| 62 | 
            +
                end
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                # CIPM has outcomes, JCRB has meeting-outcomes
         | 
| 65 | 
            +
                # As of 2024-12, no other body has this special case.
         | 
| 66 | 
            +
                outcomes_path = bodyid == :CIPM ? "outcomes" : "meeting-outcomes"
         | 
| 67 | 
            +
             | 
| 68 | 
            +
                pages[:outcomes] = VCR.use_cassette "#{body}/#{body}-outcomes#{meeting_lang_sfx}" do
         | 
| 69 | 
            +
                  a.get "#{bodyurl_local}/#{outcomes_path}"
         | 
| 70 | 
            +
                rescue Mechanize::ResponseCodeError
         | 
| 71 | 
            +
                  nil
         | 
| 72 | 
            +
                end
         | 
| 73 | 
            +
             | 
| 74 | 
            +
                meetings = pages[:meetings]
         | 
| 75 | 
            +
                publications = pages[:publications]
         | 
| 76 | 
            +
                recommendations = pages[:recommendations]
         | 
| 77 | 
            +
                outcomes = pages[:outcomes]
         | 
| 57 78 |  | 
| 58 79 | 
             
                index = {
         | 
| 59 80 | 
             
                          "meetings" => {"fr" => [], "en" => []}, 
         | 
| @@ -116,6 +137,17 @@ bodies.each do |bodyid, bodyurl| | |
| 116 137 | 
             
                      res_div.at_css('a').attr('href')
         | 
| 117 138 | 
             
                    end
         | 
| 118 139 |  | 
| 140 | 
            +
                    resolutions_additional = recommendations&.css(".bipm-resolutions .publications__content")&.map do |res_div|
         | 
| 141 | 
            +
                      href = res_div.at_css('a').attr('href')
         | 
| 142 | 
            +
             | 
| 143 | 
            +
                      # bad case of french data...
         | 
| 144 | 
            +
                      href = href.gsub('/106-2017/', '/104-_1-2015/') if href =~ %r"/ci/cipm/106-2017/resolution-[12]\z"
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                      href
         | 
| 147 | 
            +
                    end&.select do |href|
         | 
| 148 | 
            +
                      href.include? "/#{ident}/"
         | 
| 149 | 
            +
                    end || []
         | 
| 150 | 
            +
             | 
| 119 151 | 
             
                    # A mistake on a website, resolution 2 listed twice...
         | 
| 120 152 | 
             
                    # https://www.bipm.org/fr/committees/ci/cipm/94-2005/
         | 
| 121 153 | 
             
                    if [bodyid, meeting_lang, meeting_id] == [:CIPM, 'fr', '94'] && resolutions.sort.uniq != resolutions.sort
         | 
| @@ -124,6 +156,8 @@ bodies.each do |bodyid, bodyurl| | |
| 124 156 | 
             
                      end
         | 
| 125 157 | 
             
                    end
         | 
| 126 158 |  | 
| 159 | 
            +
                    resolutions = (resolutions + resolutions_additional).uniq
         | 
| 160 | 
            +
             | 
| 127 161 | 
             
                    h["resolutions"] = resolutions.map do |href|
         | 
| 128 162 | 
             
                      href = href.gsub('/web/guest/', "/#{meeting_lang}/")
         | 
| 129 163 | 
             
                      href = href.sub("www.bipm.org/", "www.bipm.org/#{meeting_lang}/") unless href.include? "/#{meeting_lang}/"
         | 
| @@ -160,7 +194,31 @@ bodies.each do |bodyid, bodyurl| | |
| 160 194 |  | 
| 161 195 | 
             
                    h["metadata"]["workgroup"] = wg if wg
         | 
| 162 196 |  | 
| 163 | 
            -
                     | 
| 197 | 
            +
                    decisions = meeting.css('.bipm-decisions .decisions')
         | 
| 198 | 
            +
             | 
| 199 | 
            +
                    # For some bodies, decisions/outcomes are on a different page altogether.
         | 
| 200 | 
            +
                    # But then we must select only decisions pertaining to our meeting.
         | 
| 201 | 
            +
                    if outcomes
         | 
| 202 | 
            +
                      decisions_additional = outcomes.css('.bipm-decisions .decisions')
         | 
| 203 | 
            +
             | 
| 204 | 
            +
                      decisions_additional = decisions_additional.select do |i|
         | 
| 205 | 
            +
                        pass = true if i["data-meeting_key"] == meeting_id
         | 
| 206 | 
            +
                        pass = true if i["data-meeting_key"] == "#{meeting_id}-0" # Some are with a 0, some without
         | 
| 207 | 
            +
                        pass = true if i["data-meeting"] == meeting_id # Some don't have meeting_key set, but have meeting set
         | 
| 208 | 
            +
             | 
| 209 | 
            +
                        pass
         | 
| 210 | 
            +
                      end
         | 
| 211 | 
            +
             | 
| 212 | 
            +
                      decisions = decisions.to_a + decisions_additional.to_a
         | 
| 213 | 
            +
             | 
| 214 | 
            +
                      # duplicates check...
         | 
| 215 | 
            +
                      duplicates = decisions.map{|i|i.at_css('.title-third').text}
         | 
| 216 | 
            +
                      if duplicates != duplicates.uniq
         | 
| 217 | 
            +
                        pp [:duplicates_found, decisions]
         | 
| 218 | 
            +
                      end
         | 
| 219 | 
            +
                    end
         | 
| 220 | 
            +
             | 
| 221 | 
            +
                    h["resolutions"] = decisions.map do |titletr|
         | 
| 164 222 | 
             
                      title = titletr.at_css('.title-third').text.strip
         | 
| 165 223 |  | 
| 166 224 | 
             
                      type = case title
         | 
| @@ -176,6 +234,9 @@ bodies.each do |bodyid, bodyurl| | |
| 176 234 | 
             
                        "decision"
         | 
| 177 235 | 
             
                      end
         | 
| 178 236 |  | 
| 237 | 
            +
                      categories = titletr.attr('data-decisioncategories')
         | 
| 238 | 
            +
                      categories ||= "[]"
         | 
| 239 | 
            +
             | 
| 179 240 | 
             
                      r = {
         | 
| 180 241 | 
             
                        "dates" => [date.to_s],
         | 
| 181 242 | 
             
                        "subject" => bodyid.to_s,
         | 
| @@ -185,7 +246,7 @@ bodies.each do |bodyid, bodyurl| | |
| 185 246 | 
             
                        "url" => meeting.uri.to_s,
         | 
| 186 247 | 
             
                        #TODO: "reference" => meeting.uri.merge(titletr.attr('data-link')).to_s,
         | 
| 187 248 |  | 
| 188 | 
            -
                        "categories" => JSON.parse( | 
| 249 | 
            +
                        "categories" => JSON.parse(categories).map(&:strip).uniq,
         | 
| 189 250 |  | 
| 190 251 | 
             
                        "considerations" => [],
         | 
| 191 252 | 
             
                        "actions" => [],
         | 
| @@ -329,6 +390,9 @@ bodies.each do |bodyid, bodyurl| | |
| 329 390 | 
             
                  wg = hs.first["metadata"]["workgroup"]
         | 
| 330 391 | 
             
                  if wg
         | 
| 331 392 | 
             
                    fn = "#{BASE_DIR}/#{body}/workgroups/#{wg}/meetings#{meeting_lang_sfx_dir}/meeting-#{mid}.yml"
         | 
| 393 | 
            +
                  elsif body == :cgpm
         | 
| 394 | 
            +
                    # CGPM old script used numbering like 00, 01, ..., 11, ...
         | 
| 395 | 
            +
                    fn = "#{BASE_DIR}/#{body}/meetings#{meeting_lang_sfx_dir}/meeting-#{"%02d" % mid}.yml"
         | 
| 332 396 | 
             
                  else
         | 
| 333 397 | 
             
                    fn = "#{BASE_DIR}/#{body}/meetings#{meeting_lang_sfx_dir}/meeting-#{mid}.yml"
         | 
| 334 398 | 
             
                  end
         | 
    
        data/lib/bipm/data/importer/asciimath.rb
    CHANGED
    
    | @@ -9,10 +9,10 @@ module Bipm | |
| 9 9 | 
             
                    SPACE_AFTER=/(?:\Z|(?= |\s|[,()\/.~"^]))/
         | 
| 10 10 |  | 
| 11 11 | 
             
                    PREFIXES = /m|c|d|k|M|G|T|/
         | 
| 12 | 
            -
                    UNITS = /t|m|mol|cal|µ|s|g|W|cd|Hz|J|K|N|V|H|A|C|F|T|Wb|sr|lx|lm|bar|sb|h|rad|°C|°F|°K/
         | 
| 12 | 
            +
                    UNITS = /t|m|mol|cal|µ|s|g|W|cd|Hz|J|K|N|V|H|A|C|F|T|Wb|sr|lx|lm|bar|sb|h|rad|fm|°C|°F|°K/
         | 
| 13 13 |  | 
| 14 14 | 
             
                    def asciidoc_extract_math str
         | 
| 15 | 
            -
                      str.gsub(/\b_(#{MATH}{1,3})_/, 'stem:[\1]')
         | 
| 15 | 
            +
                      str.gsub(/\b_?_(#{MATH}{1,3})_?_/, 'stem:[\1]')
         | 
| 16 16 | 
             
                        .gsub("_,_", ',') # Some mistake in formatting
         | 
| 17 17 | 
             
                        .gsub("^er^", 'ESCUPerESCUP') # French specialities
         | 
| 18 18 | 
             
                        .gsub(/(bar|A) (table|of|key|de|being|full|1)( |,)/, 'ESC\1 \2\3') # A is Ampere, but also a particle, bar is a bar but also a bar
         | 
| @@ -69,4 +69,4 @@ module Bipm | |
| 69 69 | 
             
                  end
         | 
| 70 70 | 
             
                end
         | 
| 71 71 | 
             
              end
         | 
| 72 | 
            -
            end
         | 
| 72 | 
            +
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: bipm-data-importer
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.2.1
         | 
| 4 | 
            +
              version: 0.2.2
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Ribose
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: exe
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2024-12- | 
| 11 | 
            +
            date: 2024-12-14 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: nokogiri
         |