bipm-data-importer 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8fcde8a899380a032040ac0f44f380edda5b0f0a6c6591dc874a9e4f90504544
4
- data.tar.gz: ba4cce57619f4356eb66309d75347fa24db76ba67936fb0e0cc161b97745c13e
3
+ metadata.gz: 3757c28f685035d3af9283b248cc57d523e9735789fe51753622d72962db84a0
4
+ data.tar.gz: 874b08f81363f1d27802e87550be8d73aae99cc5156d0b2f0df92b6b055ee780
5
5
  SHA512:
6
- metadata.gz: db266f37ca351605df8e13ea5c75fae2fe9b9ddbd8038e4392233e2e103d28e1a33c11a7f1bf397c93a8e5f32afdb293045341a776fd09f16b4cae5a054ac340
7
- data.tar.gz: 46bec77135211aec51a90a03f21dbc72f9cc1443108b965be211449f89b963d90f87a58e354a5a33ec9a750fa0c0ea6357382f4ecbf1f76434c54bd3b3818a0f
6
+ metadata.gz: c394b46052dc8d7dac1f15352176280573ef71c5cb5a72b11e8204499f1f7d7cfb23ac9b09edea208aa7ea3f48eb61f890affb518d3eaf6d79a30879b29fd9a5
7
+ data.tar.gz: f0eb33717e70a707e70c491c7c676a5e4b1769aec67b873bdfa2d57317d3c3bc17a7194c8ade7008a58aa8a296e038cddaad9d631490efdeb4bdf61655e9e2e4
@@ -30,7 +30,7 @@ Gem::Specification.new do |spec|
30
30
 
31
31
  spec.add_dependency "nokogiri"
32
32
  spec.add_dependency "mechanize"
33
- spec.add_dependency "reverse_adoc"
33
+ spec.add_dependency "coradoc"
34
34
  spec.add_dependency "pry"
35
35
 
36
36
  spec.add_dependency "vcr"
data/exe/bipm-fetch CHANGED
@@ -15,7 +15,8 @@ bodies = {
15
15
  "CCL": 'https://www.bipm.org/en/committees/cc/ccl',
16
16
  "CCEM": 'https://www.bipm.org/en/committees/cc/ccem',
17
17
  "CCAUV": 'https://www.bipm.org/en/committees/cc/ccauv',
18
- "CIPM": 'https://www.bipm.org/en/committees/ci/cipm'
18
+ "CIPM": 'https://www.bipm.org/en/committees/ci/cipm',
19
+ "CGPM": 'https://www.bipm.org/en/committees/cg/cgpm',
19
20
  }
20
21
 
21
22
  BASE_DIR = "data"
@@ -64,6 +65,7 @@ bodies.each do |bodyid, bodyurl|
64
65
 
65
66
  title = meeting_div.at_css('.meetings-list__informations-title').text.strip
66
67
  href = meeting_div.at_css('.meetings-list__informations-title').attr('href')
68
+ href = "/#{meeting_lang}" + href unless href.start_with? "/#{meeting_lang}/"
67
69
 
68
70
  ident = href.split("/#{body}/").last.gsub('/', '.')
69
71
  yr = href.include?("/wg/") ? nil : href.split('-').last
@@ -124,6 +126,7 @@ bodies.each do |bodyid, bodyurl|
124
126
 
125
127
  h["resolutions"] = resolutions.map do |href|
126
128
  href = href.gsub('/web/guest/', "/#{meeting_lang}/")
129
+ href = href.sub("www.bipm.org/", "www.bipm.org/#{meeting_lang}/") unless href.include? "/#{meeting_lang}/"
127
130
 
128
131
  # error: https://www.bipm.org/fr/committees/ci/cipm/104-_1-2015 has wrong references to Recommandations
129
132
  href = href.gsub('/104-2015/', '/104-_1-2015/')
@@ -343,7 +346,7 @@ bodies.each do |bodyid, bodyurl|
343
346
  "documents" => i.css(".publications__content").map do |d|
344
347
  {
345
348
  "title" => d.at_css(".title-third").text.strip.gsub(/\s+/, ' '),
346
- "pdf" => d.at_css(".title-third").attr("href").split('?').first,
349
+ "pdf" => d.at_css(".title-third")&.attr("href")&.split('?')&.first,
347
350
  # "description" => d.css('.publications__body')[0]&.text&.strip,
348
351
  # "author" => d.css('.publications__body')[1]&.text&.strip,
349
352
  }.compact
data/exe/bipm-fetch-cgpm CHANGED
@@ -1,71 +1,3 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/bipm-data-importer'
4
-
5
- BASE_DIR = "data"
6
- a = Mechanize.new
7
-
8
- meetings_en = VCR.use_cassette 'cgpm-meetings' do
9
- a.get "https://www.bipm.org/en/committees/cg/cgpm/cgpm-resolutions"
10
- end
11
-
12
- meetings_fr = VCR.use_cassette 'cgpm-meetings-fr' do
13
- a.get "https://www.bipm.org/fr/committees/cg/cgpm/cgpm-resolutions"
14
- end
15
-
16
- FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings"
17
- FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-fr"
18
- FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-en"
19
-
20
- [['en', meetings_en], ['fr', meetings_fr]].each do |meeting_lang, meetings|
21
- urls = meetings.css('div.publications__content').map do |option|
22
- url = option.at_css('a').attr('href')
23
- url = url.gsub('/web/guest/', "/#{meeting_lang}/")
24
- url.split('/').first(8).join('/')
25
- end.uniq
26
-
27
- urls.each do |url|
28
- meeting_id = url.split('/').last.to_i
29
- meeting_lang_sfx = (meeting_lang == 'fr') ? "-fr" : ""
30
- meeting_lang_sfx_dir = (meeting_lang == 'fr') ? "-fr" : "-en"
31
- meeting = VCR.use_cassette("cgpm-meeting-#{meeting_id}#{meeting_lang_sfx}") { a.get url }
32
-
33
- title = meeting.at_css('h1.session__title, .journal-content-article h1').text.strip
34
- date = Bipm::Data::Importer::Common.extract_date(meeting.at_css('p.session__date, .journal-content-article h2').text)
35
-
36
- pdf = Bipm::Data::Importer::Common.extract_pdf(meeting, meeting_lang)
37
-
38
- h = {
39
- "metadata" => {
40
- "title" => title,
41
- "identifier" => meeting_id,
42
- "date" => date.to_s,
43
- "source" => "BIPM - Pavillon de Breteuil",
44
- "url" => meeting.uri.to_s
45
- }
46
- }
47
-
48
- h["pdf"] = pdf if pdf
49
-
50
- resolutions = meeting.links_with(href: %r</resolution->).map(&:href)
51
-
52
- # A mistake on a website, resolution 5 listed 4 times...
53
- # https://www.bipm.org/fr/committees/cg/cgpm/8-1933
54
- if meeting_id == 8 && meeting_lang == 'fr' && resolutions.sort.uniq != resolutions.sort
55
- resolutions = (1..15).map do |i|
56
- "https://www.bipm.org/en/committees/cg/cgpm/8-1933/resolution-#{i}"
57
- end
58
- end
59
-
60
- h["resolutions"] = resolutions.map do |res_link|
61
- res_id = (res_link.split('-')[2] || 0).to_i
62
- res_link = res_link.gsub('/web/guest/', "/#{meeting_lang}/")
63
- res = VCR.use_cassette("cgpm-resolution-#{meeting_id}-#{res_id}#{meeting_lang_sfx}") { a.get res_link }
64
-
65
- Bipm::Data::Importer::Common.parse_resolution(res, res_id, date, :cgpm, meeting_lang, "resolution?")
66
- end
67
-
68
- FileUtils.mkdir_p("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}")
69
- File.write("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}/meeting-#{"%02d" % meeting_id}.yml", YAML.dump(h))
70
- end
71
- end
3
+ warn "DEPRECATED: This tool is no longer needed. Please call `bipm-fetch` only."
@@ -1,5 +1,5 @@
1
1
  require 'mechanize'
2
- require 'reverse_adoc'
2
+ require 'coradoc/input/html'
3
3
  require 'vcr'
4
4
  require 'date'
5
5
  require 'fileutils'
@@ -50,7 +50,7 @@ module Bipm
50
50
  /(?:Appendix 1 of the|L'Annexe 1 de la|increased|a (?:examiné|préparé)|transmettra|fournira|increased|developed a document|prendra contact)/i => "declares",
51
51
  /(?:Le Temps Atomique International |International Atomic Time \(TAI\) |will meet )/i => "declares",
52
52
  /(?:ask[s ]|asked|souhaite|souhaiterait)/i => "asks",
53
- /(?:(?:further )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
53
+ /(?:(?:further |et )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
54
54
  /(?:resolve[sd]?)/i => "resolves",
55
55
  /(?:confirms|confirmed?|confirme que|committed|s'engageant)/i => "confirms",
56
56
  /(?:welcom(?:e[sd]?|ing)|accueille favorablement(?:les)?|salu(?:e|ant))/i => "welcomes",
@@ -90,6 +90,8 @@ module Bipm
90
90
 
91
91
  SUFFIX=/ (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
92
92
 
93
+ DOIREGEX = %r'\s+<p>\s+<b>DOI :</b> (.*?)\s+</p>\n\n'
94
+
93
95
  module Common
94
96
  def replace_links ps, res, lang
95
97
  ps.css('a[href]').each do |a|
@@ -164,8 +166,13 @@ module Bipm
164
166
 
165
167
  def format_message part
166
168
  AsciiMath.asciidoc_extract_math(
167
- ReverseAdoc.convert(part).strip.gsub("&nbsp;", ' ').gsub(" \n", "\n")
169
+ Coradoc::Input::HTML.convert(part).strip.gsub("&nbsp;", ' ').gsub(" \n", "\n")
168
170
  )
171
+ rescue
172
+ warn "Bug in Coradoc, couldn't parse the following document:"
173
+ pp part
174
+ warn "Please report this as an issue to https://github.com/metanorma/coradoc"
175
+ raise
169
176
  end
170
177
 
171
178
  def ng_to_string ps
@@ -247,6 +254,11 @@ module Bipm
247
254
  doc = Common.ng_to_string(ps)
248
255
  # doc = AsciiMath.html_to_asciimath(doc)
249
256
 
257
+ if doc.match? DOIREGEX
258
+ doc = doc.sub(DOIREGEX, '')
259
+ r["doi"] = $1
260
+ end
261
+
250
262
  parts = doc.split(/(\n(?:<p>)?<b>.*?<\/b>|\n<p><i>.*?<\/i>|<div class="bipm-lame-grey">|<h3>|<p>(?:après examen |après avoir entendu )|having noted that |decides to define |décide de définir |conformément à l'invitation|acting in accordance with|recommande que les résultats|(?:strongly |and further |)(?:considers|recommends|recommande) (?:la|that|que(?! « ))|estime que|declares<\/p>|déclare :<\/b><\/p>|<a name="_ftn\d)/)
251
263
  nparts = [parts.shift]
252
264
  while parts.length > 0
@@ -389,6 +401,8 @@ module Bipm
389
401
  end
390
402
 
391
403
  def extract_date(date_str)
404
+ return nil unless date_str
405
+
392
406
  date = date_str.strip
393
407
  .gsub(/\s+/, ' ')
394
408
  .gsub("février", "february") # 3 first letters must match English
@@ -3,7 +3,7 @@
3
3
  module Bipm
4
4
  module Data
5
5
  module Importer
6
- VERSION = "0.1.3"
6
+ VERSION = "0.2.0"
7
7
  end
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bipm-data-importer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-01-06 00:00:00.000000000 Z
11
+ date: 2024-12-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -39,7 +39,7 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: reverse_adoc
42
+ name: coradoc
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - ">="
@@ -206,7 +206,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
206
206
  - !ruby/object:Gem::Version
207
207
  version: '0'
208
208
  requirements: []
209
- rubygems_version: 3.3.26
209
+ rubygems_version: 3.3.27
210
210
  signing_key:
211
211
  specification_version: 4
212
212
  summary: Importer for BIPM CGPM and CIPM content