bipm-data-importer 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8fcde8a899380a032040ac0f44f380edda5b0f0a6c6591dc874a9e4f90504544
4
- data.tar.gz: ba4cce57619f4356eb66309d75347fa24db76ba67936fb0e0cc161b97745c13e
3
+ metadata.gz: 3757c28f685035d3af9283b248cc57d523e9735789fe51753622d72962db84a0
4
+ data.tar.gz: 874b08f81363f1d27802e87550be8d73aae99cc5156d0b2f0df92b6b055ee780
5
5
  SHA512:
6
- metadata.gz: db266f37ca351605df8e13ea5c75fae2fe9b9ddbd8038e4392233e2e103d28e1a33c11a7f1bf397c93a8e5f32afdb293045341a776fd09f16b4cae5a054ac340
7
- data.tar.gz: 46bec77135211aec51a90a03f21dbc72f9cc1443108b965be211449f89b963d90f87a58e354a5a33ec9a750fa0c0ea6357382f4ecbf1f76434c54bd3b3818a0f
6
+ metadata.gz: c394b46052dc8d7dac1f15352176280573ef71c5cb5a72b11e8204499f1f7d7cfb23ac9b09edea208aa7ea3f48eb61f890affb518d3eaf6d79a30879b29fd9a5
7
+ data.tar.gz: f0eb33717e70a707e70c491c7c676a5e4b1769aec67b873bdfa2d57317d3c3bc17a7194c8ade7008a58aa8a296e038cddaad9d631490efdeb4bdf61655e9e2e4
@@ -30,7 +30,7 @@ Gem::Specification.new do |spec|
30
30
 
31
31
  spec.add_dependency "nokogiri"
32
32
  spec.add_dependency "mechanize"
33
- spec.add_dependency "reverse_adoc"
33
+ spec.add_dependency "coradoc"
34
34
  spec.add_dependency "pry"
35
35
 
36
36
  spec.add_dependency "vcr"
data/exe/bipm-fetch CHANGED
@@ -15,7 +15,8 @@ bodies = {
15
15
  "CCL": 'https://www.bipm.org/en/committees/cc/ccl',
16
16
  "CCEM": 'https://www.bipm.org/en/committees/cc/ccem',
17
17
  "CCAUV": 'https://www.bipm.org/en/committees/cc/ccauv',
18
- "CIPM": 'https://www.bipm.org/en/committees/ci/cipm'
18
+ "CIPM": 'https://www.bipm.org/en/committees/ci/cipm',
19
+ "CGPM": 'https://www.bipm.org/en/committees/cg/cgpm',
19
20
  }
20
21
 
21
22
  BASE_DIR = "data"
@@ -64,6 +65,7 @@ bodies.each do |bodyid, bodyurl|
64
65
 
65
66
  title = meeting_div.at_css('.meetings-list__informations-title').text.strip
66
67
  href = meeting_div.at_css('.meetings-list__informations-title').attr('href')
68
+ href = "/#{meeting_lang}" + href unless href.start_with? "/#{meeting_lang}/"
67
69
 
68
70
  ident = href.split("/#{body}/").last.gsub('/', '.')
69
71
  yr = href.include?("/wg/") ? nil : href.split('-').last
@@ -124,6 +126,7 @@ bodies.each do |bodyid, bodyurl|
124
126
 
125
127
  h["resolutions"] = resolutions.map do |href|
126
128
  href = href.gsub('/web/guest/', "/#{meeting_lang}/")
129
+ href = href.sub("www.bipm.org/", "www.bipm.org/#{meeting_lang}/") unless href.include? "/#{meeting_lang}/"
127
130
 
128
131
  # error: https://www.bipm.org/fr/committees/ci/cipm/104-_1-2015 has wrong references to Recommandations
129
132
  href = href.gsub('/104-2015/', '/104-_1-2015/')
@@ -343,7 +346,7 @@ bodies.each do |bodyid, bodyurl|
343
346
  "documents" => i.css(".publications__content").map do |d|
344
347
  {
345
348
  "title" => d.at_css(".title-third").text.strip.gsub(/\s+/, ' '),
346
- "pdf" => d.at_css(".title-third").attr("href").split('?').first,
349
+ "pdf" => d.at_css(".title-third")&.attr("href")&.split('?')&.first,
347
350
  # "description" => d.css('.publications__body')[0]&.text&.strip,
348
351
  # "author" => d.css('.publications__body')[1]&.text&.strip,
349
352
  }.compact
data/exe/bipm-fetch-cgpm CHANGED
@@ -1,71 +1,3 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/bipm-data-importer'
4
-
5
- BASE_DIR = "data"
6
- a = Mechanize.new
7
-
8
- meetings_en = VCR.use_cassette 'cgpm-meetings' do
9
- a.get "https://www.bipm.org/en/committees/cg/cgpm/cgpm-resolutions"
10
- end
11
-
12
- meetings_fr = VCR.use_cassette 'cgpm-meetings-fr' do
13
- a.get "https://www.bipm.org/fr/committees/cg/cgpm/cgpm-resolutions"
14
- end
15
-
16
- FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings"
17
- FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-fr"
18
- FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-en"
19
-
20
- [['en', meetings_en], ['fr', meetings_fr]].each do |meeting_lang, meetings|
21
- urls = meetings.css('div.publications__content').map do |option|
22
- url = option.at_css('a').attr('href')
23
- url = url.gsub('/web/guest/', "/#{meeting_lang}/")
24
- url.split('/').first(8).join('/')
25
- end.uniq
26
-
27
- urls.each do |url|
28
- meeting_id = url.split('/').last.to_i
29
- meeting_lang_sfx = (meeting_lang == 'fr') ? "-fr" : ""
30
- meeting_lang_sfx_dir = (meeting_lang == 'fr') ? "-fr" : "-en"
31
- meeting = VCR.use_cassette("cgpm-meeting-#{meeting_id}#{meeting_lang_sfx}") { a.get url }
32
-
33
- title = meeting.at_css('h1.session__title, .journal-content-article h1').text.strip
34
- date = Bipm::Data::Importer::Common.extract_date(meeting.at_css('p.session__date, .journal-content-article h2').text)
35
-
36
- pdf = Bipm::Data::Importer::Common.extract_pdf(meeting, meeting_lang)
37
-
38
- h = {
39
- "metadata" => {
40
- "title" => title,
41
- "identifier" => meeting_id,
42
- "date" => date.to_s,
43
- "source" => "BIPM - Pavillon de Breteuil",
44
- "url" => meeting.uri.to_s
45
- }
46
- }
47
-
48
- h["pdf"] = pdf if pdf
49
-
50
- resolutions = meeting.links_with(href: %r</resolution->).map(&:href)
51
-
52
- # A mistake on a website, resolution 5 listed 4 times...
53
- # https://www.bipm.org/fr/committees/cg/cgpm/8-1933
54
- if meeting_id == 8 && meeting_lang == 'fr' && resolutions.sort.uniq != resolutions.sort
55
- resolutions = (1..15).map do |i|
56
- "https://www.bipm.org/en/committees/cg/cgpm/8-1933/resolution-#{i}"
57
- end
58
- end
59
-
60
- h["resolutions"] = resolutions.map do |res_link|
61
- res_id = (res_link.split('-')[2] || 0).to_i
62
- res_link = res_link.gsub('/web/guest/', "/#{meeting_lang}/")
63
- res = VCR.use_cassette("cgpm-resolution-#{meeting_id}-#{res_id}#{meeting_lang_sfx}") { a.get res_link }
64
-
65
- Bipm::Data::Importer::Common.parse_resolution(res, res_id, date, :cgpm, meeting_lang, "resolution?")
66
- end
67
-
68
- FileUtils.mkdir_p("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}")
69
- File.write("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}/meeting-#{"%02d" % meeting_id}.yml", YAML.dump(h))
70
- end
71
- end
3
+ warn "DEPRECATED: This tool is no longer needed. Please call `bipm-fetch` only."
@@ -1,5 +1,5 @@
1
1
  require 'mechanize'
2
- require 'reverse_adoc'
2
+ require 'coradoc/input/html'
3
3
  require 'vcr'
4
4
  require 'date'
5
5
  require 'fileutils'
@@ -50,7 +50,7 @@ module Bipm
50
50
  /(?:Appendix 1 of the|L'Annexe 1 de la|increased|a (?:examiné|préparé)|transmettra|fournira|increased|developed a document|prendra contact)/i => "declares",
51
51
  /(?:Le Temps Atomique International |International Atomic Time \(TAI\) |will meet )/i => "declares",
52
52
  /(?:ask[s ]|asked|souhaite|souhaiterait)/i => "asks",
53
- /(?:(?:further )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
53
+ /(?:(?:further |et )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
54
54
  /(?:resolve[sd]?)/i => "resolves",
55
55
  /(?:confirms|confirmed?|confirme que|committed|s'engageant)/i => "confirms",
56
56
  /(?:welcom(?:e[sd]?|ing)|accueille favorablement(?:les)?|salu(?:e|ant))/i => "welcomes",
@@ -90,6 +90,8 @@ module Bipm
90
90
 
91
91
  SUFFIX=/ (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
92
92
 
93
+ DOIREGEX = %r'\s+<p>\s+<b>DOI :</b> (.*?)\s+</p>\n\n'
94
+
93
95
  module Common
94
96
  def replace_links ps, res, lang
95
97
  ps.css('a[href]').each do |a|
@@ -164,8 +166,13 @@ module Bipm
164
166
 
165
167
  def format_message part
166
168
  AsciiMath.asciidoc_extract_math(
167
- ReverseAdoc.convert(part).strip.gsub("&nbsp;", ' ').gsub(" \n", "\n")
169
+ Coradoc::Input::HTML.convert(part).strip.gsub("&nbsp;", ' ').gsub(" \n", "\n")
168
170
  )
171
+ rescue
172
+ warn "Bug in Coradoc, couldn't parse the following document:"
173
+ pp part
174
+ warn "Please report this as an issue to https://github.com/metanorma/coradoc"
175
+ raise
169
176
  end
170
177
 
171
178
  def ng_to_string ps
@@ -247,6 +254,11 @@ module Bipm
247
254
  doc = Common.ng_to_string(ps)
248
255
  # doc = AsciiMath.html_to_asciimath(doc)
249
256
 
257
+ if doc.match? DOIREGEX
258
+ doc = doc.sub(DOIREGEX, '')
259
+ r["doi"] = $1
260
+ end
261
+
250
262
  parts = doc.split(/(\n(?:<p>)?<b>.*?<\/b>|\n<p><i>.*?<\/i>|<div class="bipm-lame-grey">|<h3>|<p>(?:après examen |après avoir entendu )|having noted that |decides to define |décide de définir |conformément à l'invitation|acting in accordance with|recommande que les résultats|(?:strongly |and further |)(?:considers|recommends|recommande) (?:la|that|que(?! « ))|estime que|declares<\/p>|déclare :<\/b><\/p>|<a name="_ftn\d)/)
251
263
  nparts = [parts.shift]
252
264
  while parts.length > 0
@@ -389,6 +401,8 @@ module Bipm
389
401
  end
390
402
 
391
403
  def extract_date(date_str)
404
+ return nil unless date_str
405
+
392
406
  date = date_str.strip
393
407
  .gsub(/\s+/, ' ')
394
408
  .gsub("février", "february") # 3 first letters must match English
@@ -3,7 +3,7 @@
3
3
  module Bipm
4
4
  module Data
5
5
  module Importer
6
- VERSION = "0.1.3"
6
+ VERSION = "0.2.0"
7
7
  end
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bipm-data-importer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-01-06 00:00:00.000000000 Z
11
+ date: 2024-12-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -39,7 +39,7 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: reverse_adoc
42
+ name: coradoc
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - ">="
@@ -206,7 +206,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
206
206
  - !ruby/object:Gem::Version
207
207
  version: '0'
208
208
  requirements: []
209
- rubygems_version: 3.3.26
209
+ rubygems_version: 3.3.27
210
210
  signing_key:
211
211
  specification_version: 4
212
212
  summary: Importer for BIPM CGPM and CIPM content