bipm-data-importer 0.1.3 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bipm-data-importer.gemspec +1 -1
- data/exe/bipm-fetch +5 -2
- data/exe/bipm-fetch-cgpm +1 -69
- data/lib/bipm/data/importer/common.rb +94 -82
- data/lib/bipm/data/importer/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3bf2e9209301b13261c1b15013611cd17cf5355d6152c1f59d131aa486bbec6c
|
4
|
+
data.tar.gz: d1bf3ab0059a945f134068cf3c9cab045fdb6c9970edd3dc0cb2ce839c3dce9b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f653f802d49a88b5e9f0f5b6380a1c9a0fdd73e52be093cae3bc62020d9057221dc9b9264a75bf10198b154f16dd8010480056ae624f7a1c49e7659e1b87b6fc
|
7
|
+
data.tar.gz: dd797598112540212aa451e62089b4dde8dc89e4cbfd85e83e6c653e002e65fbab8783ca4cfaab6d8c10a4442e90cacbb6c057b02615c270bf6b7dc3f531a3e0
|
data/bipm-data-importer.gemspec
CHANGED
data/exe/bipm-fetch
CHANGED
@@ -15,7 +15,8 @@ bodies = {
|
|
15
15
|
"CCL": 'https://www.bipm.org/en/committees/cc/ccl',
|
16
16
|
"CCEM": 'https://www.bipm.org/en/committees/cc/ccem',
|
17
17
|
"CCAUV": 'https://www.bipm.org/en/committees/cc/ccauv',
|
18
|
-
"CIPM": 'https://www.bipm.org/en/committees/ci/cipm'
|
18
|
+
"CIPM": 'https://www.bipm.org/en/committees/ci/cipm',
|
19
|
+
"CGPM": 'https://www.bipm.org/en/committees/cg/cgpm',
|
19
20
|
}
|
20
21
|
|
21
22
|
BASE_DIR = "data"
|
@@ -64,6 +65,7 @@ bodies.each do |bodyid, bodyurl|
|
|
64
65
|
|
65
66
|
title = meeting_div.at_css('.meetings-list__informations-title').text.strip
|
66
67
|
href = meeting_div.at_css('.meetings-list__informations-title').attr('href')
|
68
|
+
href = "/#{meeting_lang}" + href unless href.start_with? "/#{meeting_lang}/"
|
67
69
|
|
68
70
|
ident = href.split("/#{body}/").last.gsub('/', '.')
|
69
71
|
yr = href.include?("/wg/") ? nil : href.split('-').last
|
@@ -124,6 +126,7 @@ bodies.each do |bodyid, bodyurl|
|
|
124
126
|
|
125
127
|
h["resolutions"] = resolutions.map do |href|
|
126
128
|
href = href.gsub('/web/guest/', "/#{meeting_lang}/")
|
129
|
+
href = href.sub("www.bipm.org/", "www.bipm.org/#{meeting_lang}/") unless href.include? "/#{meeting_lang}/"
|
127
130
|
|
128
131
|
# error: https://www.bipm.org/fr/committees/ci/cipm/104-_1-2015 has wrong references to Recommandations
|
129
132
|
href = href.gsub('/104-2015/', '/104-_1-2015/')
|
@@ -343,7 +346,7 @@ bodies.each do |bodyid, bodyurl|
|
|
343
346
|
"documents" => i.css(".publications__content").map do |d|
|
344
347
|
{
|
345
348
|
"title" => d.at_css(".title-third").text.strip.gsub(/\s+/, ' '),
|
346
|
-
"pdf" => d.at_css(".title-third")
|
349
|
+
"pdf" => d.at_css(".title-third")&.attr("href")&.split('?')&.first,
|
347
350
|
# "description" => d.css('.publications__body')[0]&.text&.strip,
|
348
351
|
# "author" => d.css('.publications__body')[1]&.text&.strip,
|
349
352
|
}.compact
|
data/exe/bipm-fetch-cgpm
CHANGED
@@ -1,71 +1,3 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
BASE_DIR = "data"
|
6
|
-
a = Mechanize.new
|
7
|
-
|
8
|
-
meetings_en = VCR.use_cassette 'cgpm-meetings' do
|
9
|
-
a.get "https://www.bipm.org/en/committees/cg/cgpm/cgpm-resolutions"
|
10
|
-
end
|
11
|
-
|
12
|
-
meetings_fr = VCR.use_cassette 'cgpm-meetings-fr' do
|
13
|
-
a.get "https://www.bipm.org/fr/committees/cg/cgpm/cgpm-resolutions"
|
14
|
-
end
|
15
|
-
|
16
|
-
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings"
|
17
|
-
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-fr"
|
18
|
-
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-en"
|
19
|
-
|
20
|
-
[['en', meetings_en], ['fr', meetings_fr]].each do |meeting_lang, meetings|
|
21
|
-
urls = meetings.css('div.publications__content').map do |option|
|
22
|
-
url = option.at_css('a').attr('href')
|
23
|
-
url = url.gsub('/web/guest/', "/#{meeting_lang}/")
|
24
|
-
url.split('/').first(8).join('/')
|
25
|
-
end.uniq
|
26
|
-
|
27
|
-
urls.each do |url|
|
28
|
-
meeting_id = url.split('/').last.to_i
|
29
|
-
meeting_lang_sfx = (meeting_lang == 'fr') ? "-fr" : ""
|
30
|
-
meeting_lang_sfx_dir = (meeting_lang == 'fr') ? "-fr" : "-en"
|
31
|
-
meeting = VCR.use_cassette("cgpm-meeting-#{meeting_id}#{meeting_lang_sfx}") { a.get url }
|
32
|
-
|
33
|
-
title = meeting.at_css('h1.session__title, .journal-content-article h1').text.strip
|
34
|
-
date = Bipm::Data::Importer::Common.extract_date(meeting.at_css('p.session__date, .journal-content-article h2').text)
|
35
|
-
|
36
|
-
pdf = Bipm::Data::Importer::Common.extract_pdf(meeting, meeting_lang)
|
37
|
-
|
38
|
-
h = {
|
39
|
-
"metadata" => {
|
40
|
-
"title" => title,
|
41
|
-
"identifier" => meeting_id,
|
42
|
-
"date" => date.to_s,
|
43
|
-
"source" => "BIPM - Pavillon de Breteuil",
|
44
|
-
"url" => meeting.uri.to_s
|
45
|
-
}
|
46
|
-
}
|
47
|
-
|
48
|
-
h["pdf"] = pdf if pdf
|
49
|
-
|
50
|
-
resolutions = meeting.links_with(href: %r</resolution->).map(&:href)
|
51
|
-
|
52
|
-
# A mistake on a website, resolution 5 listed 4 times...
|
53
|
-
# https://www.bipm.org/fr/committees/cg/cgpm/8-1933
|
54
|
-
if meeting_id == 8 && meeting_lang == 'fr' && resolutions.sort.uniq != resolutions.sort
|
55
|
-
resolutions = (1..15).map do |i|
|
56
|
-
"https://www.bipm.org/en/committees/cg/cgpm/8-1933/resolution-#{i}"
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
h["resolutions"] = resolutions.map do |res_link|
|
61
|
-
res_id = (res_link.split('-')[2] || 0).to_i
|
62
|
-
res_link = res_link.gsub('/web/guest/', "/#{meeting_lang}/")
|
63
|
-
res = VCR.use_cassette("cgpm-resolution-#{meeting_id}-#{res_id}#{meeting_lang_sfx}") { a.get res_link }
|
64
|
-
|
65
|
-
Bipm::Data::Importer::Common.parse_resolution(res, res_id, date, :cgpm, meeting_lang, "resolution?")
|
66
|
-
end
|
67
|
-
|
68
|
-
FileUtils.mkdir_p("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}")
|
69
|
-
File.write("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}/meeting-#{"%02d" % meeting_id}.yml", YAML.dump(h))
|
70
|
-
end
|
71
|
-
end
|
3
|
+
warn "DEPRECATED: This tool is no longer needed. Please call `bipm-fetch` only."
|
@@ -1,20 +1,19 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require_relative
|
1
|
+
require "mechanize"
|
2
|
+
require "coradoc/input/html"
|
3
|
+
require "vcr"
|
4
|
+
require "date"
|
5
|
+
require "fileutils"
|
6
|
+
require "pry"
|
7
|
+
require_relative "asciimath"
|
8
8
|
|
9
9
|
VCR.configure do |c|
|
10
|
-
c.cassette_library_dir = __dir__+
|
10
|
+
c.cassette_library_dir = __dir__ + "/../../../../cassettes"
|
11
11
|
c.hook_into :webmock
|
12
12
|
end
|
13
13
|
|
14
14
|
module Bipm
|
15
15
|
module Data
|
16
16
|
module Importer
|
17
|
-
|
18
17
|
CONSIDERATIONS = {
|
19
18
|
/(?:having(?: regard)?|ayant|concerne|vu la|agissant conformément|sachant|de porter)/i => "having / having regard",
|
20
19
|
/(?:noting|to note|took note|note[sd]?|taking note|takes note|constatant|constate|that|notant|notant que|note également|(?:prend|prenant) (?:acte|note))/i => "noting",
|
@@ -50,7 +49,7 @@ module Bipm
|
|
50
49
|
/(?:Appendix 1 of the|L'Annexe 1 de la|increased|a (?:examiné|préparé)|transmettra|fournira|increased|developed a document|prendra contact)/i => "declares",
|
51
50
|
/(?:Le Temps Atomique International |International Atomic Time \(TAI\) |will meet )/i => "declares",
|
52
51
|
/(?:ask[s ]|asked|souhaite|souhaiterait)/i => "asks",
|
53
|
-
/(?:(?:further )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
|
52
|
+
/(?:(?:further |et )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
|
54
53
|
/(?:resolve[sd]?)/i => "resolves",
|
55
54
|
/(?:confirms|confirmed?|confirme que|committed|s'engageant)/i => "confirms",
|
56
55
|
/(?:welcom(?:e[sd]?|ing)|accueille favorablement(?:les)?|salu(?:e|ant))/i => "welcomes",
|
@@ -79,50 +78,52 @@ module Bipm
|
|
79
78
|
/(?:empowers|habilite)/i => "empowers",
|
80
79
|
}
|
81
80
|
|
82
|
-
PREFIX1
|
83
|
-
PREFIX2
|
84
|
-
PREFIX3
|
85
|
-
PREFIX4
|
86
|
-
PREFIX5
|
87
|
-
PREFIX6
|
81
|
+
PREFIX1 = /(?:The|Le) CIPM |La Conférence |M. Volterra |M. le Président |unanimously |would |a |sont |will |were |did not |strongly |(?:La|The) (?:\d+(?:e|th)|Quinzième) Conférence Générale des Poids et Mesures(?: a |,\s+)?/i
|
82
|
+
PREFIX2 = /The \d+th Conférence Générale des Poids et Mesures |The Conference |and |et (?:en |)|has |renouvelle sa |renews its |further |and further |En ce qui |après avoir |\.\.\.\n+\t*/i
|
83
|
+
PREFIX3 = /Sur la proposition de M. le Président, la convocation de cette Conférence de Thermométrie est |Le texte corrigé, finalement |(?:The|Le) Comité International(?: des Poids et Mesures)?(?: \(CIPM\))?(?: a |,)?\s*/i
|
84
|
+
PREFIX4 = /(?:The |Le |)(?:JCRB|JCGM|CCU|CCTF|CCT|CCRI|CCPR|CCQM|CCM|CCL|CCEM|CCAUV|KCDB),? (?:also |)|Each RMO |fully |The JCRB Rules of Procedure are |Bob Watters and Claudine Thomas /
|
85
|
+
PREFIX5 = /(?:The |Le |All |)(?:incoming |)(?:JCRB |KCDB |)(?:documents|(?:Consultative |)Committees?|Office|Chairman(?: and Secretary|)|Joint BIPM[\/-]ILAC Working Group(?: \(see Action 22\))|RMO(?:[- ]JCRB|) Representatives(?: to the JRCB|)|(?:BIPM |)Director(?: of BIPM|)|SIM|(?:Exec(?:utive|) |)Secretary(?:\(ies\)|)|RMOs, except SIM,|RMOs|APMP|\(?(?:[MD]r|Prof) [A-Z][a-zR-]+\)?|CMCs|EUR[AO]MET|COOMET|GULFMET) |It was /
|
86
|
+
PREFIX6 = /“|"|« à |All RMO documents related to review procedures |Mr Lam and Dr Kühne |The Prof. Kühne, Mr Jones and the Executive Secretary |Ajchara Charoensook, from APMP, /
|
87
|
+
|
88
|
+
PREFIX = /(?:#{PREFIX1}|#{PREFIX2}|#{PREFIX3}|#{PREFIX4}|#{PREFIX5}|#{PREFIX6})?/i
|
88
89
|
|
89
|
-
|
90
|
+
SUFFIX = / (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
|
90
91
|
|
91
|
-
|
92
|
+
DOIREGEX = %r'\s+<p>\s+<b>DOI :</b> (.*?)\s+</p>\n\n'
|
92
93
|
|
93
94
|
module Common
|
94
|
-
def replace_links
|
95
|
-
ps.css(
|
96
|
-
href = a.attr(
|
95
|
+
def replace_links(ps, res, lang)
|
96
|
+
ps.css("a[href]").each do |a|
|
97
|
+
href = a.attr("href")
|
97
98
|
|
98
|
-
href = href.gsub(%r'\Ahttps://www
|
99
|
+
href = href.gsub(%r'\Ahttps://www\.bipm\.org/', "")
|
99
100
|
|
100
101
|
# Correct links
|
101
|
-
href = href.gsub(
|
102
|
+
href = href.gsub("/web/guest/", "/#{lang}/")
|
102
103
|
|
103
104
|
# Account for some mistakes from an upstream document
|
104
|
-
href = href.gsub(%r"\A/jen/",
|
105
|
-
href = href.gsub(%r"\A/en/CGPM/jsp/",
|
105
|
+
href = href.gsub(%r"\A/jen/", "/en/")
|
106
|
+
href = href.gsub(%r"\A/en/CGPM/jsp/", "/en/CGPM/db/")
|
106
107
|
|
107
108
|
href = case href
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
109
|
+
when %r'\A/(\w{2})/CGPM/db/(\d+)/(\d+)/(#.*)?\z',
|
110
|
+
%r'\A/jsp/(\w{2})/ViewCGPMResolution\.jsp\?CGPM=(\d+)&RES=(\d+)(#.*)?\z',
|
111
|
+
%r'\A/(\w{2})/committees/cg/cgpm/(\d+)-\d+/resolution-(\d+)(#.*)?\z',
|
112
|
+
"cgpm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
|
113
|
+
when %r'\A/(\w{2})/CIPM/db/(\d+)/(\d+)/(#.*)?\z'
|
114
|
+
"cipm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
|
115
|
+
when %r'\A/(\w{2})/committees/cipm/meeting/([0-9()I]+).html(#.*)?\z'
|
116
|
+
"cipm-decisions:#{$1}/#{$2}#{$3}"
|
117
|
+
else
|
118
|
+
URI(res.uri).merge(href).to_s # Relative -> absolute
|
119
|
+
end
|
119
120
|
|
120
|
-
a.set_attribute(
|
121
|
+
a.set_attribute("href", href)
|
121
122
|
end
|
122
123
|
end
|
123
124
|
|
124
|
-
def replace_centers
|
125
|
-
centers = ps.css(
|
125
|
+
def replace_centers(ps)
|
126
|
+
centers = ps.css("center").to_a
|
126
127
|
while centers.length > 0
|
127
128
|
center = centers.first
|
128
129
|
current = center
|
@@ -131,7 +132,7 @@ module Bipm
|
|
131
132
|
break unless current.next
|
132
133
|
while Nokogiri::XML::Text === current.next
|
133
134
|
current = current.next
|
134
|
-
break if current.text.strip !=
|
135
|
+
break if current.text.strip != ""
|
135
136
|
end
|
136
137
|
break unless current.next
|
137
138
|
break unless current.next.name == "center"
|
@@ -157,47 +158,52 @@ module Bipm
|
|
157
158
|
end
|
158
159
|
|
159
160
|
# Remove the remaining centers
|
160
|
-
ps.css(
|
161
|
+
ps.css("center").each do |i|
|
161
162
|
i.replace i.inner_html
|
162
163
|
end
|
163
164
|
end
|
164
165
|
|
165
|
-
def format_message
|
166
|
+
def format_message(part)
|
166
167
|
AsciiMath.asciidoc_extract_math(
|
167
|
-
|
168
|
+
Coradoc::Input::HTML.convert(part).strip.gsub(" ", " ").gsub(" \n", "\n")
|
168
169
|
)
|
170
|
+
rescue
|
171
|
+
warn "Bug in Coradoc, couldn't parse the following document:"
|
172
|
+
pp part
|
173
|
+
warn "Please report this as an issue to https://github.com/metanorma/coradoc"
|
174
|
+
raise
|
169
175
|
end
|
170
176
|
|
171
|
-
def ng_to_string
|
172
|
-
ps.inner_html.encode(
|
177
|
+
def ng_to_string(ps)
|
178
|
+
ps.inner_html.encode("utf-8").gsub("\r", "").gsub(%r'</?nobr>', "")
|
173
179
|
end
|
174
180
|
|
175
|
-
def parse_resolution
|
181
|
+
def parse_resolution(res, res_id, date, type = :cgpm, lang = "en", rec_type = nil)
|
176
182
|
# Reparse the document after fixing upstream syntax
|
177
183
|
fixed_body = res.body.gsub("<name=", "<a name=")
|
178
|
-
fixed_body = fixed_body.force_encoding(
|
179
|
-
fixed_body = fixed_body.gsub(
|
180
|
-
fixed_body = fixed_body.gsub(
|
181
|
-
fixed_body = fixed_body.gsub(
|
182
|
-
fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m,
|
184
|
+
fixed_body = fixed_body.force_encoding("utf-8")
|
185
|
+
fixed_body = fixed_body.gsub("É", "É")
|
186
|
+
fixed_body = fixed_body.gsub("« ", "« ")
|
187
|
+
fixed_body = fixed_body.gsub(" »", " »")
|
188
|
+
fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m, "")
|
183
189
|
supertitle = $1.strip
|
184
|
-
fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m,
|
190
|
+
fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m, "")
|
185
191
|
title = $1.strip
|
186
192
|
fixed_body = fixed_body.sub(/(="web-content">)\s*<p>\s*(<p)/, '\1\2')
|
187
193
|
fixed_body = fixed_body.gsub(%r"<a name=\"haut\">(.*?)</a>"m, '\1')
|
188
194
|
ng = Nokogiri::HTML(fixed_body, res.uri.to_s, "utf-8", Nokogiri::XML::ParseOptions.new.default_html.noent)
|
189
195
|
|
190
|
-
refs = ng.css(
|
196
|
+
refs = ng.css(".publication-card_reference a")
|
191
197
|
|
192
198
|
if rec_type.end_with? "?"
|
193
199
|
rec_type = case supertitle
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
200
|
+
when /\AD[eé]claration/
|
201
|
+
"statement"
|
202
|
+
when /\AR[eé]solution/
|
203
|
+
"resolution"
|
204
|
+
else
|
205
|
+
rec_type[..-2]
|
206
|
+
end
|
201
207
|
end
|
202
208
|
|
203
209
|
r = {
|
@@ -220,7 +226,7 @@ module Bipm
|
|
220
226
|
r.delete("type") unless r["type"]
|
221
227
|
|
222
228
|
if refs.length > 0
|
223
|
-
r["reference"] = res.uri.merge(refs.first.attr(
|
229
|
+
r["reference"] = res.uri.merge(refs.first.attr("href")).to_s.split("?").first
|
224
230
|
name, page = refs.first.text.strip.split(/, p(?=[0-9])/)
|
225
231
|
r["reference_name"] = name
|
226
232
|
if page
|
@@ -234,7 +240,7 @@ module Bipm
|
|
234
240
|
r.delete("reference_page")
|
235
241
|
end
|
236
242
|
|
237
|
-
ps = ng.css(
|
243
|
+
ps = ng.css("div.journal-content-article").first
|
238
244
|
|
239
245
|
#binding.pry if ps.count != 1
|
240
246
|
|
@@ -247,6 +253,11 @@ module Bipm
|
|
247
253
|
doc = Common.ng_to_string(ps)
|
248
254
|
# doc = AsciiMath.html_to_asciimath(doc)
|
249
255
|
|
256
|
+
if doc.match? DOIREGEX
|
257
|
+
doc = doc.sub(DOIREGEX, "")
|
258
|
+
r["doi"] = $1
|
259
|
+
end
|
260
|
+
|
250
261
|
parts = doc.split(/(\n(?:<p>)?<b>.*?<\/b>|\n<p><i>.*?<\/i>|<div class="bipm-lame-grey">|<h3>|<p>(?:après examen |après avoir entendu )|having noted that |decides to define |décide de définir |conformément à l'invitation|acting in accordance with|recommande que les résultats|(?:strongly |and further |)(?:considers|recommends|recommande) (?:la|that|que(?! « ))|estime que|declares<\/p>|déclare :<\/b><\/p>|<a name="_ftn\d)/)
|
251
262
|
nparts = [parts.shift]
|
252
263
|
while parts.length > 0
|
@@ -267,13 +278,13 @@ module Bipm
|
|
267
278
|
next
|
268
279
|
end
|
269
280
|
|
270
|
-
if parse.start_with?
|
271
|
-
part = part.sub(
|
281
|
+
if parse.start_with? "NOTE"
|
282
|
+
part = part.sub("<h3>NOTE</h3>", "")
|
272
283
|
r["notes"] = Common.format_message(part)
|
273
284
|
next
|
274
285
|
end
|
275
286
|
|
276
|
-
CONSIDERATIONS.any? do |k,v|
|
287
|
+
CONSIDERATIONS.any? do |k, v|
|
277
288
|
if parse =~ /\A#{PREFIX}#{k}\b/i
|
278
289
|
r["considerations"] << prev = {
|
279
290
|
"type" => v,
|
@@ -283,7 +294,7 @@ module Bipm
|
|
283
294
|
end
|
284
295
|
end && next
|
285
296
|
|
286
|
-
ACTIONS.any? do |k,v|
|
297
|
+
ACTIONS.any? do |k, v|
|
287
298
|
if parse =~ /\A#{PREFIX}#{k}\b/i
|
288
299
|
r["actions"] << prev = {
|
289
300
|
"type" => v,
|
@@ -314,13 +325,13 @@ module Bipm
|
|
314
325
|
end
|
315
326
|
|
316
327
|
%w[considerations actions].each do |type|
|
317
|
-
map = type ==
|
328
|
+
map = type == "actions" ? ACTIONS : CONSIDERATIONS
|
318
329
|
r[type] = r[type].map do |i|
|
319
330
|
islist = false
|
320
331
|
|
321
332
|
kk = nil
|
322
333
|
|
323
|
-
if map.any? { |k,v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
|
334
|
+
if map.any? { |k, v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
|
324
335
|
prefix = $2
|
325
336
|
suffix = $3
|
326
337
|
subject = $4
|
@@ -347,15 +358,15 @@ module Bipm
|
|
347
358
|
|
348
359
|
if subject
|
349
360
|
#p subject
|
350
|
-
r[
|
351
|
-
r[
|
361
|
+
r["subject"] ||= []
|
362
|
+
r["subject"] << subject
|
352
363
|
end
|
353
364
|
|
354
365
|
if islist
|
355
366
|
suffix = suffix.strip
|
356
|
-
suffix = nil if suffix ==
|
367
|
+
suffix = nil if suffix == ""
|
357
368
|
listitems.map do |li|
|
358
|
-
i.merge
|
369
|
+
i.merge "message" => [prefix, suffix, li].compact.join(" ")
|
359
370
|
end
|
360
371
|
else
|
361
372
|
i
|
@@ -363,13 +374,13 @@ module Bipm
|
|
363
374
|
end.flatten
|
364
375
|
end
|
365
376
|
|
366
|
-
if r[
|
367
|
-
r[
|
377
|
+
if r["subject"]
|
378
|
+
r["subject"] = r["subject"].uniq.join(" and ")
|
368
379
|
end
|
369
380
|
|
370
381
|
# Note: we replace the previously set r['subject'].
|
371
|
-
r[
|
372
|
-
r[
|
382
|
+
r["subject"] = type.to_s.upcase.gsub("-", " ")
|
383
|
+
r["subject"] = "CCDS" if type == :cctf && supertitle.include?("CCDS")
|
373
384
|
|
374
385
|
r
|
375
386
|
end
|
@@ -377,11 +388,11 @@ module Bipm
|
|
377
388
|
def extract_pdf(meeting, lang)
|
378
389
|
pdfs = meeting.css('a.title-third[href*=".pdf"]')
|
379
390
|
.map { |i| i.attr("href") }
|
380
|
-
.map { |i| i.split(
|
391
|
+
.map { |i| i.split("?").first }
|
381
392
|
.select do |i|
|
382
|
-
|
383
|
-
|
384
|
-
|
393
|
+
i.downcase.include?("-#{lang}.pdf") ||
|
394
|
+
%w[en fr].none? { |l| i.downcase.include? "-#{l}.pdf" }
|
395
|
+
end
|
385
396
|
|
386
397
|
pdfs = pdfs.first if pdfs.length <= 1
|
387
398
|
|
@@ -389,8 +400,10 @@ module Bipm
|
|
389
400
|
end
|
390
401
|
|
391
402
|
def extract_date(date_str)
|
403
|
+
return nil unless date_str
|
404
|
+
|
392
405
|
date = date_str.strip
|
393
|
-
.gsub(/\s+/,
|
406
|
+
.gsub(/\s+/, " ")
|
394
407
|
.gsub("février", "february") # 3 first letters must match English
|
395
408
|
.gsub("juin", "june")
|
396
409
|
.gsub("avril", "april")
|
@@ -411,7 +424,6 @@ module Bipm
|
|
411
424
|
|
412
425
|
extend self
|
413
426
|
end
|
414
|
-
|
415
427
|
end
|
416
428
|
end
|
417
429
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bipm-data-importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-12-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -39,7 +39,7 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: coradoc
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
@@ -206,7 +206,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
206
206
|
- !ruby/object:Gem::Version
|
207
207
|
version: '0'
|
208
208
|
requirements: []
|
209
|
-
rubygems_version: 3.3.
|
209
|
+
rubygems_version: 3.3.27
|
210
210
|
signing_key:
|
211
211
|
specification_version: 4
|
212
212
|
summary: Importer for BIPM CGPM and CIPM content
|