bipm-data-importer 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bipm-data-importer.gemspec +1 -1
- data/exe/bipm-fetch +5 -2
- data/exe/bipm-fetch-cgpm +1 -69
- data/lib/bipm/data/importer/common.rb +17 -3
- data/lib/bipm/data/importer/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3757c28f685035d3af9283b248cc57d523e9735789fe51753622d72962db84a0
|
4
|
+
data.tar.gz: 874b08f81363f1d27802e87550be8d73aae99cc5156d0b2f0df92b6b055ee780
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c394b46052dc8d7dac1f15352176280573ef71c5cb5a72b11e8204499f1f7d7cfb23ac9b09edea208aa7ea3f48eb61f890affb518d3eaf6d79a30879b29fd9a5
|
7
|
+
data.tar.gz: f0eb33717e70a707e70c491c7c676a5e4b1769aec67b873bdfa2d57317d3c3bc17a7194c8ade7008a58aa8a296e038cddaad9d631490efdeb4bdf61655e9e2e4
|
data/bipm-data-importer.gemspec
CHANGED
data/exe/bipm-fetch
CHANGED
@@ -15,7 +15,8 @@ bodies = {
|
|
15
15
|
"CCL": 'https://www.bipm.org/en/committees/cc/ccl',
|
16
16
|
"CCEM": 'https://www.bipm.org/en/committees/cc/ccem',
|
17
17
|
"CCAUV": 'https://www.bipm.org/en/committees/cc/ccauv',
|
18
|
-
"CIPM": 'https://www.bipm.org/en/committees/ci/cipm'
|
18
|
+
"CIPM": 'https://www.bipm.org/en/committees/ci/cipm',
|
19
|
+
"CGPM": 'https://www.bipm.org/en/committees/cg/cgpm',
|
19
20
|
}
|
20
21
|
|
21
22
|
BASE_DIR = "data"
|
@@ -64,6 +65,7 @@ bodies.each do |bodyid, bodyurl|
|
|
64
65
|
|
65
66
|
title = meeting_div.at_css('.meetings-list__informations-title').text.strip
|
66
67
|
href = meeting_div.at_css('.meetings-list__informations-title').attr('href')
|
68
|
+
href = "/#{meeting_lang}" + href unless href.start_with? "/#{meeting_lang}/"
|
67
69
|
|
68
70
|
ident = href.split("/#{body}/").last.gsub('/', '.')
|
69
71
|
yr = href.include?("/wg/") ? nil : href.split('-').last
|
@@ -124,6 +126,7 @@ bodies.each do |bodyid, bodyurl|
|
|
124
126
|
|
125
127
|
h["resolutions"] = resolutions.map do |href|
|
126
128
|
href = href.gsub('/web/guest/', "/#{meeting_lang}/")
|
129
|
+
href = href.sub("www.bipm.org/", "www.bipm.org/#{meeting_lang}/") unless href.include? "/#{meeting_lang}/"
|
127
130
|
|
128
131
|
# error: https://www.bipm.org/fr/committees/ci/cipm/104-_1-2015 has wrong references to Recommandations
|
129
132
|
href = href.gsub('/104-2015/', '/104-_1-2015/')
|
@@ -343,7 +346,7 @@ bodies.each do |bodyid, bodyurl|
|
|
343
346
|
"documents" => i.css(".publications__content").map do |d|
|
344
347
|
{
|
345
348
|
"title" => d.at_css(".title-third").text.strip.gsub(/\s+/, ' '),
|
346
|
-
"pdf" => d.at_css(".title-third")
|
349
|
+
"pdf" => d.at_css(".title-third")&.attr("href")&.split('?')&.first,
|
347
350
|
# "description" => d.css('.publications__body')[0]&.text&.strip,
|
348
351
|
# "author" => d.css('.publications__body')[1]&.text&.strip,
|
349
352
|
}.compact
|
data/exe/bipm-fetch-cgpm
CHANGED
@@ -1,71 +1,3 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
BASE_DIR = "data"
|
6
|
-
a = Mechanize.new
|
7
|
-
|
8
|
-
meetings_en = VCR.use_cassette 'cgpm-meetings' do
|
9
|
-
a.get "https://www.bipm.org/en/committees/cg/cgpm/cgpm-resolutions"
|
10
|
-
end
|
11
|
-
|
12
|
-
meetings_fr = VCR.use_cassette 'cgpm-meetings-fr' do
|
13
|
-
a.get "https://www.bipm.org/fr/committees/cg/cgpm/cgpm-resolutions"
|
14
|
-
end
|
15
|
-
|
16
|
-
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings"
|
17
|
-
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-fr"
|
18
|
-
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-en"
|
19
|
-
|
20
|
-
[['en', meetings_en], ['fr', meetings_fr]].each do |meeting_lang, meetings|
|
21
|
-
urls = meetings.css('div.publications__content').map do |option|
|
22
|
-
url = option.at_css('a').attr('href')
|
23
|
-
url = url.gsub('/web/guest/', "/#{meeting_lang}/")
|
24
|
-
url.split('/').first(8).join('/')
|
25
|
-
end.uniq
|
26
|
-
|
27
|
-
urls.each do |url|
|
28
|
-
meeting_id = url.split('/').last.to_i
|
29
|
-
meeting_lang_sfx = (meeting_lang == 'fr') ? "-fr" : ""
|
30
|
-
meeting_lang_sfx_dir = (meeting_lang == 'fr') ? "-fr" : "-en"
|
31
|
-
meeting = VCR.use_cassette("cgpm-meeting-#{meeting_id}#{meeting_lang_sfx}") { a.get url }
|
32
|
-
|
33
|
-
title = meeting.at_css('h1.session__title, .journal-content-article h1').text.strip
|
34
|
-
date = Bipm::Data::Importer::Common.extract_date(meeting.at_css('p.session__date, .journal-content-article h2').text)
|
35
|
-
|
36
|
-
pdf = Bipm::Data::Importer::Common.extract_pdf(meeting, meeting_lang)
|
37
|
-
|
38
|
-
h = {
|
39
|
-
"metadata" => {
|
40
|
-
"title" => title,
|
41
|
-
"identifier" => meeting_id,
|
42
|
-
"date" => date.to_s,
|
43
|
-
"source" => "BIPM - Pavillon de Breteuil",
|
44
|
-
"url" => meeting.uri.to_s
|
45
|
-
}
|
46
|
-
}
|
47
|
-
|
48
|
-
h["pdf"] = pdf if pdf
|
49
|
-
|
50
|
-
resolutions = meeting.links_with(href: %r</resolution->).map(&:href)
|
51
|
-
|
52
|
-
# A mistake on a website, resolution 5 listed 4 times...
|
53
|
-
# https://www.bipm.org/fr/committees/cg/cgpm/8-1933
|
54
|
-
if meeting_id == 8 && meeting_lang == 'fr' && resolutions.sort.uniq != resolutions.sort
|
55
|
-
resolutions = (1..15).map do |i|
|
56
|
-
"https://www.bipm.org/en/committees/cg/cgpm/8-1933/resolution-#{i}"
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
h["resolutions"] = resolutions.map do |res_link|
|
61
|
-
res_id = (res_link.split('-')[2] || 0).to_i
|
62
|
-
res_link = res_link.gsub('/web/guest/', "/#{meeting_lang}/")
|
63
|
-
res = VCR.use_cassette("cgpm-resolution-#{meeting_id}-#{res_id}#{meeting_lang_sfx}") { a.get res_link }
|
64
|
-
|
65
|
-
Bipm::Data::Importer::Common.parse_resolution(res, res_id, date, :cgpm, meeting_lang, "resolution?")
|
66
|
-
end
|
67
|
-
|
68
|
-
FileUtils.mkdir_p("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}")
|
69
|
-
File.write("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}/meeting-#{"%02d" % meeting_id}.yml", YAML.dump(h))
|
70
|
-
end
|
71
|
-
end
|
3
|
+
warn "DEPRECATED: This tool is no longer needed. Please call `bipm-fetch` only."
|
data/lib/bipm/data/importer/common.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'mechanize'
|
2
|
-
require '
|
2
|
+
require 'coradoc/input/html'
|
3
3
|
require 'vcr'
|
4
4
|
require 'date'
|
5
5
|
require 'fileutils'
|
@@ -50,7 +50,7 @@ module Bipm
|
|
50
50
|
/(?:Appendix 1 of the|L'Annexe 1 de la|increased|a (?:examiné|préparé)|transmettra|fournira|increased|developed a document|prendra contact)/i => "declares",
|
51
51
|
/(?:Le Temps Atomique International |International Atomic Time \(TAI\) |will meet )/i => "declares",
|
52
52
|
/(?:ask[s ]|asked|souhaite|souhaiterait)/i => "asks",
|
53
|
-
/(?:(?:further )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
|
53
|
+
/(?:(?:further |et )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
|
54
54
|
/(?:resolve[sd]?)/i => "resolves",
|
55
55
|
/(?:confirms|confirmed?|confirme que|committed|s'engageant)/i => "confirms",
|
56
56
|
/(?:welcom(?:e[sd]?|ing)|accueille favorablement(?:les)?|salu(?:e|ant))/i => "welcomes",
|
@@ -90,6 +90,8 @@ module Bipm
|
|
90
90
|
|
91
91
|
SUFFIX=/ (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
|
92
92
|
|
93
|
+
DOIREGEX = %r'\s+<p>\s+<b>DOI :</b> (.*?)\s+</p>\n\n'
|
94
|
+
|
93
95
|
module Common
|
94
96
|
def replace_links ps, res, lang
|
95
97
|
ps.css('a[href]').each do |a|
|
@@ -164,8 +166,13 @@ module Bipm
|
|
164
166
|
|
165
167
|
def format_message part
|
166
168
|
AsciiMath.asciidoc_extract_math(
|
167
|
-
|
169
|
+
Coradoc::Input::HTML.convert(part).strip.gsub(" ", ' ').gsub(" \n", "\n")
|
168
170
|
)
|
171
|
+
rescue
|
172
|
+
warn "Bug in Coradoc, couldn't parse the following document:"
|
173
|
+
pp part
|
174
|
+
warn "Please report this as an issue to https://github.com/metanorma/coradoc"
|
175
|
+
raise
|
169
176
|
end
|
170
177
|
|
171
178
|
def ng_to_string ps
|
@@ -247,6 +254,11 @@ module Bipm
|
|
247
254
|
doc = Common.ng_to_string(ps)
|
248
255
|
# doc = AsciiMath.html_to_asciimath(doc)
|
249
256
|
|
257
|
+
if doc.match? DOIREGEX
|
258
|
+
doc = doc.sub(DOIREGEX, '')
|
259
|
+
r["doi"] = $1
|
260
|
+
end
|
261
|
+
|
250
262
|
parts = doc.split(/(\n(?:<p>)?<b>.*?<\/b>|\n<p><i>.*?<\/i>|<div class="bipm-lame-grey">|<h3>|<p>(?:après examen |après avoir entendu )|having noted that |decides to define |décide de définir |conformément à l'invitation|acting in accordance with|recommande que les résultats|(?:strongly |and further |)(?:considers|recommends|recommande) (?:la|that|que(?! « ))|estime que|declares<\/p>|déclare :<\/b><\/p>|<a name="_ftn\d)/)
|
251
263
|
nparts = [parts.shift]
|
252
264
|
while parts.length > 0
|
@@ -389,6 +401,8 @@ module Bipm
|
|
389
401
|
end
|
390
402
|
|
391
403
|
def extract_date(date_str)
|
404
|
+
return nil unless date_str
|
405
|
+
|
392
406
|
date = date_str.strip
|
393
407
|
.gsub(/\s+/, ' ')
|
394
408
|
.gsub("février", "february") # 3 first letters must match English
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bipm-data-importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-12-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -39,7 +39,7 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: coradoc
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
@@ -206,7 +206,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
206
206
|
- !ruby/object:Gem::Version
|
207
207
|
version: '0'
|
208
208
|
requirements: []
|
209
|
-
rubygems_version: 3.3.
|
209
|
+
rubygems_version: 3.3.27
|
210
210
|
signing_key:
|
211
211
|
specification_version: 4
|
212
212
|
summary: Importer for BIPM CGPM and CIPM content
|