bipm-data-importer 0.1.3 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bipm-data-importer.gemspec +1 -1
- data/exe/bipm-fetch +5 -2
- data/exe/bipm-fetch-cgpm +1 -69
- data/lib/bipm/data/importer/common.rb +17 -3
- data/lib/bipm/data/importer/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3757c28f685035d3af9283b248cc57d523e9735789fe51753622d72962db84a0
|
4
|
+
data.tar.gz: 874b08f81363f1d27802e87550be8d73aae99cc5156d0b2f0df92b6b055ee780
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c394b46052dc8d7dac1f15352176280573ef71c5cb5a72b11e8204499f1f7d7cfb23ac9b09edea208aa7ea3f48eb61f890affb518d3eaf6d79a30879b29fd9a5
|
7
|
+
data.tar.gz: f0eb33717e70a707e70c491c7c676a5e4b1769aec67b873bdfa2d57317d3c3bc17a7194c8ade7008a58aa8a296e038cddaad9d631490efdeb4bdf61655e9e2e4
|
data/bipm-data-importer.gemspec
CHANGED
data/exe/bipm-fetch
CHANGED
@@ -15,7 +15,8 @@ bodies = {
|
|
15
15
|
"CCL": 'https://www.bipm.org/en/committees/cc/ccl',
|
16
16
|
"CCEM": 'https://www.bipm.org/en/committees/cc/ccem',
|
17
17
|
"CCAUV": 'https://www.bipm.org/en/committees/cc/ccauv',
|
18
|
-
"CIPM": 'https://www.bipm.org/en/committees/ci/cipm'
|
18
|
+
"CIPM": 'https://www.bipm.org/en/committees/ci/cipm',
|
19
|
+
"CGPM": 'https://www.bipm.org/en/committees/cg/cgpm',
|
19
20
|
}
|
20
21
|
|
21
22
|
BASE_DIR = "data"
|
@@ -64,6 +65,7 @@ bodies.each do |bodyid, bodyurl|
|
|
64
65
|
|
65
66
|
title = meeting_div.at_css('.meetings-list__informations-title').text.strip
|
66
67
|
href = meeting_div.at_css('.meetings-list__informations-title').attr('href')
|
68
|
+
href = "/#{meeting_lang}" + href unless href.start_with? "/#{meeting_lang}/"
|
67
69
|
|
68
70
|
ident = href.split("/#{body}/").last.gsub('/', '.')
|
69
71
|
yr = href.include?("/wg/") ? nil : href.split('-').last
|
@@ -124,6 +126,7 @@ bodies.each do |bodyid, bodyurl|
|
|
124
126
|
|
125
127
|
h["resolutions"] = resolutions.map do |href|
|
126
128
|
href = href.gsub('/web/guest/', "/#{meeting_lang}/")
|
129
|
+
href = href.sub("www.bipm.org/", "www.bipm.org/#{meeting_lang}/") unless href.include? "/#{meeting_lang}/"
|
127
130
|
|
128
131
|
# error: https://www.bipm.org/fr/committees/ci/cipm/104-_1-2015 has wrong references to Recommandations
|
129
132
|
href = href.gsub('/104-2015/', '/104-_1-2015/')
|
@@ -343,7 +346,7 @@ bodies.each do |bodyid, bodyurl|
|
|
343
346
|
"documents" => i.css(".publications__content").map do |d|
|
344
347
|
{
|
345
348
|
"title" => d.at_css(".title-third").text.strip.gsub(/\s+/, ' '),
|
346
|
-
"pdf" => d.at_css(".title-third")
|
349
|
+
"pdf" => d.at_css(".title-third")&.attr("href")&.split('?')&.first,
|
347
350
|
# "description" => d.css('.publications__body')[0]&.text&.strip,
|
348
351
|
# "author" => d.css('.publications__body')[1]&.text&.strip,
|
349
352
|
}.compact
|
data/exe/bipm-fetch-cgpm
CHANGED
@@ -1,71 +1,3 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
BASE_DIR = "data"
|
6
|
-
a = Mechanize.new
|
7
|
-
|
8
|
-
meetings_en = VCR.use_cassette 'cgpm-meetings' do
|
9
|
-
a.get "https://www.bipm.org/en/committees/cg/cgpm/cgpm-resolutions"
|
10
|
-
end
|
11
|
-
|
12
|
-
meetings_fr = VCR.use_cassette 'cgpm-meetings-fr' do
|
13
|
-
a.get "https://www.bipm.org/fr/committees/cg/cgpm/cgpm-resolutions"
|
14
|
-
end
|
15
|
-
|
16
|
-
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings"
|
17
|
-
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-fr"
|
18
|
-
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-en"
|
19
|
-
|
20
|
-
[['en', meetings_en], ['fr', meetings_fr]].each do |meeting_lang, meetings|
|
21
|
-
urls = meetings.css('div.publications__content').map do |option|
|
22
|
-
url = option.at_css('a').attr('href')
|
23
|
-
url = url.gsub('/web/guest/', "/#{meeting_lang}/")
|
24
|
-
url.split('/').first(8).join('/')
|
25
|
-
end.uniq
|
26
|
-
|
27
|
-
urls.each do |url|
|
28
|
-
meeting_id = url.split('/').last.to_i
|
29
|
-
meeting_lang_sfx = (meeting_lang == 'fr') ? "-fr" : ""
|
30
|
-
meeting_lang_sfx_dir = (meeting_lang == 'fr') ? "-fr" : "-en"
|
31
|
-
meeting = VCR.use_cassette("cgpm-meeting-#{meeting_id}#{meeting_lang_sfx}") { a.get url }
|
32
|
-
|
33
|
-
title = meeting.at_css('h1.session__title, .journal-content-article h1').text.strip
|
34
|
-
date = Bipm::Data::Importer::Common.extract_date(meeting.at_css('p.session__date, .journal-content-article h2').text)
|
35
|
-
|
36
|
-
pdf = Bipm::Data::Importer::Common.extract_pdf(meeting, meeting_lang)
|
37
|
-
|
38
|
-
h = {
|
39
|
-
"metadata" => {
|
40
|
-
"title" => title,
|
41
|
-
"identifier" => meeting_id,
|
42
|
-
"date" => date.to_s,
|
43
|
-
"source" => "BIPM - Pavillon de Breteuil",
|
44
|
-
"url" => meeting.uri.to_s
|
45
|
-
}
|
46
|
-
}
|
47
|
-
|
48
|
-
h["pdf"] = pdf if pdf
|
49
|
-
|
50
|
-
resolutions = meeting.links_with(href: %r</resolution->).map(&:href)
|
51
|
-
|
52
|
-
# A mistake on a website, resolution 5 listed 4 times...
|
53
|
-
# https://www.bipm.org/fr/committees/cg/cgpm/8-1933
|
54
|
-
if meeting_id == 8 && meeting_lang == 'fr' && resolutions.sort.uniq != resolutions.sort
|
55
|
-
resolutions = (1..15).map do |i|
|
56
|
-
"https://www.bipm.org/en/committees/cg/cgpm/8-1933/resolution-#{i}"
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
h["resolutions"] = resolutions.map do |res_link|
|
61
|
-
res_id = (res_link.split('-')[2] || 0).to_i
|
62
|
-
res_link = res_link.gsub('/web/guest/', "/#{meeting_lang}/")
|
63
|
-
res = VCR.use_cassette("cgpm-resolution-#{meeting_id}-#{res_id}#{meeting_lang_sfx}") { a.get res_link }
|
64
|
-
|
65
|
-
Bipm::Data::Importer::Common.parse_resolution(res, res_id, date, :cgpm, meeting_lang, "resolution?")
|
66
|
-
end
|
67
|
-
|
68
|
-
FileUtils.mkdir_p("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}")
|
69
|
-
File.write("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}/meeting-#{"%02d" % meeting_id}.yml", YAML.dump(h))
|
70
|
-
end
|
71
|
-
end
|
3
|
+
warn "DEPRECATED: This tool is no longer needed. Please call `bipm-fetch` only."
|
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'mechanize'
|
2
|
-
require '
|
2
|
+
require 'coradoc/input/html'
|
3
3
|
require 'vcr'
|
4
4
|
require 'date'
|
5
5
|
require 'fileutils'
|
@@ -50,7 +50,7 @@ module Bipm
|
|
50
50
|
/(?:Appendix 1 of the|L'Annexe 1 de la|increased|a (?:examiné|préparé)|transmettra|fournira|increased|developed a document|prendra contact)/i => "declares",
|
51
51
|
/(?:Le Temps Atomique International |International Atomic Time \(TAI\) |will meet )/i => "declares",
|
52
52
|
/(?:ask[s ]|asked|souhaite|souhaiterait)/i => "asks",
|
53
|
-
/(?:(?:further )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
|
53
|
+
/(?:(?:further |et )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
|
54
54
|
/(?:resolve[sd]?)/i => "resolves",
|
55
55
|
/(?:confirms|confirmed?|confirme que|committed|s'engageant)/i => "confirms",
|
56
56
|
/(?:welcom(?:e[sd]?|ing)|accueille favorablement(?:les)?|salu(?:e|ant))/i => "welcomes",
|
@@ -90,6 +90,8 @@ module Bipm
|
|
90
90
|
|
91
91
|
SUFFIX=/ (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
|
92
92
|
|
93
|
+
DOIREGEX = %r'\s+<p>\s+<b>DOI :</b> (.*?)\s+</p>\n\n'
|
94
|
+
|
93
95
|
module Common
|
94
96
|
def replace_links ps, res, lang
|
95
97
|
ps.css('a[href]').each do |a|
|
@@ -164,8 +166,13 @@ module Bipm
|
|
164
166
|
|
165
167
|
def format_message part
|
166
168
|
AsciiMath.asciidoc_extract_math(
|
167
|
-
|
169
|
+
Coradoc::Input::HTML.convert(part).strip.gsub(" ", ' ').gsub(" \n", "\n")
|
168
170
|
)
|
171
|
+
rescue
|
172
|
+
warn "Bug in Coradoc, couldn't parse the following document:"
|
173
|
+
pp part
|
174
|
+
warn "Please report this as an issue to https://github.com/metanorma/coradoc"
|
175
|
+
raise
|
169
176
|
end
|
170
177
|
|
171
178
|
def ng_to_string ps
|
@@ -247,6 +254,11 @@ module Bipm
|
|
247
254
|
doc = Common.ng_to_string(ps)
|
248
255
|
# doc = AsciiMath.html_to_asciimath(doc)
|
249
256
|
|
257
|
+
if doc.match? DOIREGEX
|
258
|
+
doc = doc.sub(DOIREGEX, '')
|
259
|
+
r["doi"] = $1
|
260
|
+
end
|
261
|
+
|
250
262
|
parts = doc.split(/(\n(?:<p>)?<b>.*?<\/b>|\n<p><i>.*?<\/i>|<div class="bipm-lame-grey">|<h3>|<p>(?:après examen |après avoir entendu )|having noted that |decides to define |décide de définir |conformément à l'invitation|acting in accordance with|recommande que les résultats|(?:strongly |and further |)(?:considers|recommends|recommande) (?:la|that|que(?! « ))|estime que|declares<\/p>|déclare :<\/b><\/p>|<a name="_ftn\d)/)
|
251
263
|
nparts = [parts.shift]
|
252
264
|
while parts.length > 0
|
@@ -389,6 +401,8 @@ module Bipm
|
|
389
401
|
end
|
390
402
|
|
391
403
|
def extract_date(date_str)
|
404
|
+
return nil unless date_str
|
405
|
+
|
392
406
|
date = date_str.strip
|
393
407
|
.gsub(/\s+/, ' ')
|
394
408
|
.gsub("février", "february") # 3 first letters must match English
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bipm-data-importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-12-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -39,7 +39,7 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: coradoc
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
@@ -206,7 +206,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
206
206
|
- !ruby/object:Gem::Version
|
207
207
|
version: '0'
|
208
208
|
requirements: []
|
209
|
-
rubygems_version: 3.3.
|
209
|
+
rubygems_version: 3.3.27
|
210
210
|
signing_key:
|
211
211
|
specification_version: 4
|
212
212
|
summary: Importer for BIPM CGPM and CIPM content
|