bipm-data-importer 0.1.3 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bipm-data-importer.gemspec +1 -1
- data/exe/bipm-fetch +5 -2
- data/exe/bipm-fetch-cgpm +1 -69
- data/lib/bipm/data/importer/common.rb +94 -82
- data/lib/bipm/data/importer/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3bf2e9209301b13261c1b15013611cd17cf5355d6152c1f59d131aa486bbec6c
|
4
|
+
data.tar.gz: d1bf3ab0059a945f134068cf3c9cab045fdb6c9970edd3dc0cb2ce839c3dce9b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f653f802d49a88b5e9f0f5b6380a1c9a0fdd73e52be093cae3bc62020d9057221dc9b9264a75bf10198b154f16dd8010480056ae624f7a1c49e7659e1b87b6fc
|
7
|
+
data.tar.gz: dd797598112540212aa451e62089b4dde8dc89e4cbfd85e83e6c653e002e65fbab8783ca4cfaab6d8c10a4442e90cacbb6c057b02615c270bf6b7dc3f531a3e0
|
data/bipm-data-importer.gemspec
CHANGED
data/exe/bipm-fetch
CHANGED
@@ -15,7 +15,8 @@ bodies = {
|
|
15
15
|
"CCL": 'https://www.bipm.org/en/committees/cc/ccl',
|
16
16
|
"CCEM": 'https://www.bipm.org/en/committees/cc/ccem',
|
17
17
|
"CCAUV": 'https://www.bipm.org/en/committees/cc/ccauv',
|
18
|
-
"CIPM": 'https://www.bipm.org/en/committees/ci/cipm'
|
18
|
+
"CIPM": 'https://www.bipm.org/en/committees/ci/cipm',
|
19
|
+
"CGPM": 'https://www.bipm.org/en/committees/cg/cgpm',
|
19
20
|
}
|
20
21
|
|
21
22
|
BASE_DIR = "data"
|
@@ -64,6 +65,7 @@ bodies.each do |bodyid, bodyurl|
|
|
64
65
|
|
65
66
|
title = meeting_div.at_css('.meetings-list__informations-title').text.strip
|
66
67
|
href = meeting_div.at_css('.meetings-list__informations-title').attr('href')
|
68
|
+
href = "/#{meeting_lang}" + href unless href.start_with? "/#{meeting_lang}/"
|
67
69
|
|
68
70
|
ident = href.split("/#{body}/").last.gsub('/', '.')
|
69
71
|
yr = href.include?("/wg/") ? nil : href.split('-').last
|
@@ -124,6 +126,7 @@ bodies.each do |bodyid, bodyurl|
|
|
124
126
|
|
125
127
|
h["resolutions"] = resolutions.map do |href|
|
126
128
|
href = href.gsub('/web/guest/', "/#{meeting_lang}/")
|
129
|
+
href = href.sub("www.bipm.org/", "www.bipm.org/#{meeting_lang}/") unless href.include? "/#{meeting_lang}/"
|
127
130
|
|
128
131
|
# error: https://www.bipm.org/fr/committees/ci/cipm/104-_1-2015 has wrong references to Recommandations
|
129
132
|
href = href.gsub('/104-2015/', '/104-_1-2015/')
|
@@ -343,7 +346,7 @@ bodies.each do |bodyid, bodyurl|
|
|
343
346
|
"documents" => i.css(".publications__content").map do |d|
|
344
347
|
{
|
345
348
|
"title" => d.at_css(".title-third").text.strip.gsub(/\s+/, ' '),
|
346
|
-
"pdf" => d.at_css(".title-third")
|
349
|
+
"pdf" => d.at_css(".title-third")&.attr("href")&.split('?')&.first,
|
347
350
|
# "description" => d.css('.publications__body')[0]&.text&.strip,
|
348
351
|
# "author" => d.css('.publications__body')[1]&.text&.strip,
|
349
352
|
}.compact
|
data/exe/bipm-fetch-cgpm
CHANGED
@@ -1,71 +1,3 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
BASE_DIR = "data"
|
6
|
-
a = Mechanize.new
|
7
|
-
|
8
|
-
meetings_en = VCR.use_cassette 'cgpm-meetings' do
|
9
|
-
a.get "https://www.bipm.org/en/committees/cg/cgpm/cgpm-resolutions"
|
10
|
-
end
|
11
|
-
|
12
|
-
meetings_fr = VCR.use_cassette 'cgpm-meetings-fr' do
|
13
|
-
a.get "https://www.bipm.org/fr/committees/cg/cgpm/cgpm-resolutions"
|
14
|
-
end
|
15
|
-
|
16
|
-
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings"
|
17
|
-
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-fr"
|
18
|
-
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-en"
|
19
|
-
|
20
|
-
[['en', meetings_en], ['fr', meetings_fr]].each do |meeting_lang, meetings|
|
21
|
-
urls = meetings.css('div.publications__content').map do |option|
|
22
|
-
url = option.at_css('a').attr('href')
|
23
|
-
url = url.gsub('/web/guest/', "/#{meeting_lang}/")
|
24
|
-
url.split('/').first(8).join('/')
|
25
|
-
end.uniq
|
26
|
-
|
27
|
-
urls.each do |url|
|
28
|
-
meeting_id = url.split('/').last.to_i
|
29
|
-
meeting_lang_sfx = (meeting_lang == 'fr') ? "-fr" : ""
|
30
|
-
meeting_lang_sfx_dir = (meeting_lang == 'fr') ? "-fr" : "-en"
|
31
|
-
meeting = VCR.use_cassette("cgpm-meeting-#{meeting_id}#{meeting_lang_sfx}") { a.get url }
|
32
|
-
|
33
|
-
title = meeting.at_css('h1.session__title, .journal-content-article h1').text.strip
|
34
|
-
date = Bipm::Data::Importer::Common.extract_date(meeting.at_css('p.session__date, .journal-content-article h2').text)
|
35
|
-
|
36
|
-
pdf = Bipm::Data::Importer::Common.extract_pdf(meeting, meeting_lang)
|
37
|
-
|
38
|
-
h = {
|
39
|
-
"metadata" => {
|
40
|
-
"title" => title,
|
41
|
-
"identifier" => meeting_id,
|
42
|
-
"date" => date.to_s,
|
43
|
-
"source" => "BIPM - Pavillon de Breteuil",
|
44
|
-
"url" => meeting.uri.to_s
|
45
|
-
}
|
46
|
-
}
|
47
|
-
|
48
|
-
h["pdf"] = pdf if pdf
|
49
|
-
|
50
|
-
resolutions = meeting.links_with(href: %r</resolution->).map(&:href)
|
51
|
-
|
52
|
-
# A mistake on a website, resolution 5 listed 4 times...
|
53
|
-
# https://www.bipm.org/fr/committees/cg/cgpm/8-1933
|
54
|
-
if meeting_id == 8 && meeting_lang == 'fr' && resolutions.sort.uniq != resolutions.sort
|
55
|
-
resolutions = (1..15).map do |i|
|
56
|
-
"https://www.bipm.org/en/committees/cg/cgpm/8-1933/resolution-#{i}"
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
h["resolutions"] = resolutions.map do |res_link|
|
61
|
-
res_id = (res_link.split('-')[2] || 0).to_i
|
62
|
-
res_link = res_link.gsub('/web/guest/', "/#{meeting_lang}/")
|
63
|
-
res = VCR.use_cassette("cgpm-resolution-#{meeting_id}-#{res_id}#{meeting_lang_sfx}") { a.get res_link }
|
64
|
-
|
65
|
-
Bipm::Data::Importer::Common.parse_resolution(res, res_id, date, :cgpm, meeting_lang, "resolution?")
|
66
|
-
end
|
67
|
-
|
68
|
-
FileUtils.mkdir_p("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}")
|
69
|
-
File.write("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}/meeting-#{"%02d" % meeting_id}.yml", YAML.dump(h))
|
70
|
-
end
|
71
|
-
end
|
3
|
+
warn "DEPRECATED: This tool is no longer needed. Please call `bipm-fetch` only."
|
data/lib/bipm/data/importer/common.rb
CHANGED
@@ -1,20 +1,19 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require_relative
|
1
|
+
require "mechanize"
|
2
|
+
require "coradoc/input/html"
|
3
|
+
require "vcr"
|
4
|
+
require "date"
|
5
|
+
require "fileutils"
|
6
|
+
require "pry"
|
7
|
+
require_relative "asciimath"
|
8
8
|
|
9
9
|
VCR.configure do |c|
|
10
|
-
c.cassette_library_dir = __dir__+
|
10
|
+
c.cassette_library_dir = __dir__ + "/../../../../cassettes"
|
11
11
|
c.hook_into :webmock
|
12
12
|
end
|
13
13
|
|
14
14
|
module Bipm
|
15
15
|
module Data
|
16
16
|
module Importer
|
17
|
-
|
18
17
|
CONSIDERATIONS = {
|
19
18
|
/(?:having(?: regard)?|ayant|concerne|vu la|agissant conformément|sachant|de porter)/i => "having / having regard",
|
20
19
|
/(?:noting|to note|took note|note[sd]?|taking note|takes note|constatant|constate|that|notant|notant que|note également|(?:prend|prenant) (?:acte|note))/i => "noting",
|
@@ -50,7 +49,7 @@ module Bipm
|
|
50
49
|
/(?:Appendix 1 of the|L'Annexe 1 de la|increased|a (?:examiné|préparé)|transmettra|fournira|increased|developed a document|prendra contact)/i => "declares",
|
51
50
|
/(?:Le Temps Atomique International |International Atomic Time \(TAI\) |will meet )/i => "declares",
|
52
51
|
/(?:ask[s ]|asked|souhaite|souhaiterait)/i => "asks",
|
53
|
-
/(?:(?:further )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
|
52
|
+
/(?:(?:further |et )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
|
54
53
|
/(?:resolve[sd]?)/i => "resolves",
|
55
54
|
/(?:confirms|confirmed?|confirme que|committed|s'engageant)/i => "confirms",
|
56
55
|
/(?:welcom(?:e[sd]?|ing)|accueille favorablement(?:les)?|salu(?:e|ant))/i => "welcomes",
|
@@ -79,50 +78,52 @@ module Bipm
|
|
79
78
|
/(?:empowers|habilite)/i => "empowers",
|
80
79
|
}
|
81
80
|
|
82
|
-
PREFIX1
|
83
|
-
PREFIX2
|
84
|
-
PREFIX3
|
85
|
-
PREFIX4
|
86
|
-
PREFIX5
|
87
|
-
PREFIX6
|
81
|
+
PREFIX1 = /(?:The|Le) CIPM |La Conférence |M. Volterra |M. le Président |unanimously |would |a |sont |will |were |did not |strongly |(?:La|The) (?:\d+(?:e|th)|Quinzième) Conférence Générale des Poids et Mesures(?: a |,\s+)?/i
|
82
|
+
PREFIX2 = /The \d+th Conférence Générale des Poids et Mesures |The Conference |and |et (?:en |)|has |renouvelle sa |renews its |further |and further |En ce qui |après avoir |\.\.\.\n+\t*/i
|
83
|
+
PREFIX3 = /Sur la proposition de M. le Président, la convocation de cette Conférence de Thermométrie est |Le texte corrigé, finalement |(?:The|Le) Comité International(?: des Poids et Mesures)?(?: \(CIPM\))?(?: a |,)?\s*/i
|
84
|
+
PREFIX4 = /(?:The |Le |)(?:JCRB|JCGM|CCU|CCTF|CCT|CCRI|CCPR|CCQM|CCM|CCL|CCEM|CCAUV|KCDB),? (?:also |)|Each RMO |fully |The JCRB Rules of Procedure are |Bob Watters and Claudine Thomas /
|
85
|
+
PREFIX5 = /(?:The |Le |All |)(?:incoming |)(?:JCRB |KCDB |)(?:documents|(?:Consultative |)Committees?|Office|Chairman(?: and Secretary|)|Joint BIPM[\/-]ILAC Working Group(?: \(see Action 22\))|RMO(?:[- ]JCRB|) Representatives(?: to the JRCB|)|(?:BIPM |)Director(?: of BIPM|)|SIM|(?:Exec(?:utive|) |)Secretary(?:\(ies\)|)|RMOs, except SIM,|RMOs|APMP|\(?(?:[MD]r|Prof) [A-Z][a-zR-]+\)?|CMCs|EUR[AO]MET|COOMET|GULFMET) |It was /
|
86
|
+
PREFIX6 = /“|"|« à |All RMO documents related to review procedures |Mr Lam and Dr Kühne |The Prof. Kühne, Mr Jones and the Executive Secretary |Ajchara Charoensook, from APMP, /
|
87
|
+
|
88
|
+
PREFIX = /(?:#{PREFIX1}|#{PREFIX2}|#{PREFIX3}|#{PREFIX4}|#{PREFIX5}|#{PREFIX6})?/i
|
88
89
|
|
89
|
-
|
90
|
+
SUFFIX = / (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
|
90
91
|
|
91
|
-
|
92
|
+
DOIREGEX = %r'\s+<p>\s+<b>DOI :</b> (.*?)\s+</p>\n\n'
|
92
93
|
|
93
94
|
module Common
|
94
|
-
def replace_links
|
95
|
-
ps.css(
|
96
|
-
href = a.attr(
|
95
|
+
def replace_links(ps, res, lang)
|
96
|
+
ps.css("a[href]").each do |a|
|
97
|
+
href = a.attr("href")
|
97
98
|
|
98
|
-
href = href.gsub(%r'\Ahttps://www
|
99
|
+
href = href.gsub(%r'\Ahttps://www\.bipm\.org/', "")
|
99
100
|
|
100
101
|
# Correct links
|
101
|
-
href = href.gsub(
|
102
|
+
href = href.gsub("/web/guest/", "/#{lang}/")
|
102
103
|
|
103
104
|
# Account for some mistakes from an upstream document
|
104
|
-
href = href.gsub(%r"\A/jen/",
|
105
|
-
href = href.gsub(%r"\A/en/CGPM/jsp/",
|
105
|
+
href = href.gsub(%r"\A/jen/", "/en/")
|
106
|
+
href = href.gsub(%r"\A/en/CGPM/jsp/", "/en/CGPM/db/")
|
106
107
|
|
107
108
|
href = case href
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
109
|
+
when %r'\A/(\w{2})/CGPM/db/(\d+)/(\d+)/(#.*)?\z',
|
110
|
+
%r'\A/jsp/(\w{2})/ViewCGPMResolution\.jsp\?CGPM=(\d+)&RES=(\d+)(#.*)?\z',
|
111
|
+
%r'\A/(\w{2})/committees/cg/cgpm/(\d+)-\d+/resolution-(\d+)(#.*)?\z',
|
112
|
+
"cgpm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
|
113
|
+
when %r'\A/(\w{2})/CIPM/db/(\d+)/(\d+)/(#.*)?\z'
|
114
|
+
"cipm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
|
115
|
+
when %r'\A/(\w{2})/committees/cipm/meeting/([0-9()I]+).html(#.*)?\z'
|
116
|
+
"cipm-decisions:#{$1}/#{$2}#{$3}"
|
117
|
+
else
|
118
|
+
URI(res.uri).merge(href).to_s # Relative -> absolute
|
119
|
+
end
|
119
120
|
|
120
|
-
a.set_attribute(
|
121
|
+
a.set_attribute("href", href)
|
121
122
|
end
|
122
123
|
end
|
123
124
|
|
124
|
-
def replace_centers
|
125
|
-
centers = ps.css(
|
125
|
+
def replace_centers(ps)
|
126
|
+
centers = ps.css("center").to_a
|
126
127
|
while centers.length > 0
|
127
128
|
center = centers.first
|
128
129
|
current = center
|
@@ -131,7 +132,7 @@ module Bipm
|
|
131
132
|
break unless current.next
|
132
133
|
while Nokogiri::XML::Text === current.next
|
133
134
|
current = current.next
|
134
|
-
break if current.text.strip !=
|
135
|
+
break if current.text.strip != ""
|
135
136
|
end
|
136
137
|
break unless current.next
|
137
138
|
break unless current.next.name == "center"
|
@@ -157,47 +158,52 @@ module Bipm
|
|
157
158
|
end
|
158
159
|
|
159
160
|
# Remove the remaining centers
|
160
|
-
ps.css(
|
161
|
+
ps.css("center").each do |i|
|
161
162
|
i.replace i.inner_html
|
162
163
|
end
|
163
164
|
end
|
164
165
|
|
165
|
-
def format_message
|
166
|
+
def format_message(part)
|
166
167
|
AsciiMath.asciidoc_extract_math(
|
167
|
-
|
168
|
+
Coradoc::Input::HTML.convert(part).strip.gsub(" ", " ").gsub(" \n", "\n")
|
168
169
|
)
|
170
|
+
rescue
|
171
|
+
warn "Bug in Coradoc, couldn't parse the following document:"
|
172
|
+
pp part
|
173
|
+
warn "Please report this as an issue to https://github.com/metanorma/coradoc"
|
174
|
+
raise
|
169
175
|
end
|
170
176
|
|
171
|
-
def ng_to_string
|
172
|
-
ps.inner_html.encode(
|
177
|
+
def ng_to_string(ps)
|
178
|
+
ps.inner_html.encode("utf-8").gsub("\r", "").gsub(%r'</?nobr>', "")
|
173
179
|
end
|
174
180
|
|
175
|
-
def parse_resolution
|
181
|
+
def parse_resolution(res, res_id, date, type = :cgpm, lang = "en", rec_type = nil)
|
176
182
|
# Reparse the document after fixing upstream syntax
|
177
183
|
fixed_body = res.body.gsub("<name=", "<a name=")
|
178
|
-
fixed_body = fixed_body.force_encoding(
|
179
|
-
fixed_body = fixed_body.gsub(
|
180
|
-
fixed_body = fixed_body.gsub(
|
181
|
-
fixed_body = fixed_body.gsub(
|
182
|
-
fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m,
|
184
|
+
fixed_body = fixed_body.force_encoding("utf-8")
|
185
|
+
fixed_body = fixed_body.gsub("É", "É")
|
186
|
+
fixed_body = fixed_body.gsub("« ", "« ")
|
187
|
+
fixed_body = fixed_body.gsub(" »", " »")
|
188
|
+
fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m, "")
|
183
189
|
supertitle = $1.strip
|
184
|
-
fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m,
|
190
|
+
fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m, "")
|
185
191
|
title = $1.strip
|
186
192
|
fixed_body = fixed_body.sub(/(="web-content">)\s*<p>\s*(<p)/, '\1\2')
|
187
193
|
fixed_body = fixed_body.gsub(%r"<a name=\"haut\">(.*?)</a>"m, '\1')
|
188
194
|
ng = Nokogiri::HTML(fixed_body, res.uri.to_s, "utf-8", Nokogiri::XML::ParseOptions.new.default_html.noent)
|
189
195
|
|
190
|
-
refs = ng.css(
|
196
|
+
refs = ng.css(".publication-card_reference a")
|
191
197
|
|
192
198
|
if rec_type.end_with? "?"
|
193
199
|
rec_type = case supertitle
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
200
|
+
when /\AD[eé]claration/
|
201
|
+
"statement"
|
202
|
+
when /\AR[eé]solution/
|
203
|
+
"resolution"
|
204
|
+
else
|
205
|
+
rec_type[..-2]
|
206
|
+
end
|
201
207
|
end
|
202
208
|
|
203
209
|
r = {
|
@@ -220,7 +226,7 @@ module Bipm
|
|
220
226
|
r.delete("type") unless r["type"]
|
221
227
|
|
222
228
|
if refs.length > 0
|
223
|
-
r["reference"] = res.uri.merge(refs.first.attr(
|
229
|
+
r["reference"] = res.uri.merge(refs.first.attr("href")).to_s.split("?").first
|
224
230
|
name, page = refs.first.text.strip.split(/, p(?=[0-9])/)
|
225
231
|
r["reference_name"] = name
|
226
232
|
if page
|
@@ -234,7 +240,7 @@ module Bipm
|
|
234
240
|
r.delete("reference_page")
|
235
241
|
end
|
236
242
|
|
237
|
-
ps = ng.css(
|
243
|
+
ps = ng.css("div.journal-content-article").first
|
238
244
|
|
239
245
|
#binding.pry if ps.count != 1
|
240
246
|
|
@@ -247,6 +253,11 @@ module Bipm
|
|
247
253
|
doc = Common.ng_to_string(ps)
|
248
254
|
# doc = AsciiMath.html_to_asciimath(doc)
|
249
255
|
|
256
|
+
if doc.match? DOIREGEX
|
257
|
+
doc = doc.sub(DOIREGEX, "")
|
258
|
+
r["doi"] = $1
|
259
|
+
end
|
260
|
+
|
250
261
|
parts = doc.split(/(\n(?:<p>)?<b>.*?<\/b>|\n<p><i>.*?<\/i>|<div class="bipm-lame-grey">|<h3>|<p>(?:après examen |après avoir entendu )|having noted that |decides to define |décide de définir |conformément à l'invitation|acting in accordance with|recommande que les résultats|(?:strongly |and further |)(?:considers|recommends|recommande) (?:la|that|que(?! « ))|estime que|declares<\/p>|déclare :<\/b><\/p>|<a name="_ftn\d)/)
|
251
262
|
nparts = [parts.shift]
|
252
263
|
while parts.length > 0
|
@@ -267,13 +278,13 @@ module Bipm
|
|
267
278
|
next
|
268
279
|
end
|
269
280
|
|
270
|
-
if parse.start_with?
|
271
|
-
part = part.sub(
|
281
|
+
if parse.start_with? "NOTE"
|
282
|
+
part = part.sub("<h3>NOTE</h3>", "")
|
272
283
|
r["notes"] = Common.format_message(part)
|
273
284
|
next
|
274
285
|
end
|
275
286
|
|
276
|
-
CONSIDERATIONS.any? do |k,v|
|
287
|
+
CONSIDERATIONS.any? do |k, v|
|
277
288
|
if parse =~ /\A#{PREFIX}#{k}\b/i
|
278
289
|
r["considerations"] << prev = {
|
279
290
|
"type" => v,
|
@@ -283,7 +294,7 @@ module Bipm
|
|
283
294
|
end
|
284
295
|
end && next
|
285
296
|
|
286
|
-
ACTIONS.any? do |k,v|
|
297
|
+
ACTIONS.any? do |k, v|
|
287
298
|
if parse =~ /\A#{PREFIX}#{k}\b/i
|
288
299
|
r["actions"] << prev = {
|
289
300
|
"type" => v,
|
@@ -314,13 +325,13 @@ module Bipm
|
|
314
325
|
end
|
315
326
|
|
316
327
|
%w[considerations actions].each do |type|
|
317
|
-
map = type ==
|
328
|
+
map = type == "actions" ? ACTIONS : CONSIDERATIONS
|
318
329
|
r[type] = r[type].map do |i|
|
319
330
|
islist = false
|
320
331
|
|
321
332
|
kk = nil
|
322
333
|
|
323
|
-
if map.any? { |k,v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
|
334
|
+
if map.any? { |k, v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
|
324
335
|
prefix = $2
|
325
336
|
suffix = $3
|
326
337
|
subject = $4
|
@@ -347,15 +358,15 @@ module Bipm
|
|
347
358
|
|
348
359
|
if subject
|
349
360
|
#p subject
|
350
|
-
r[
|
351
|
-
r[
|
361
|
+
r["subject"] ||= []
|
362
|
+
r["subject"] << subject
|
352
363
|
end
|
353
364
|
|
354
365
|
if islist
|
355
366
|
suffix = suffix.strip
|
356
|
-
suffix = nil if suffix ==
|
367
|
+
suffix = nil if suffix == ""
|
357
368
|
listitems.map do |li|
|
358
|
-
i.merge
|
369
|
+
i.merge "message" => [prefix, suffix, li].compact.join(" ")
|
359
370
|
end
|
360
371
|
else
|
361
372
|
i
|
@@ -363,13 +374,13 @@ module Bipm
|
|
363
374
|
end.flatten
|
364
375
|
end
|
365
376
|
|
366
|
-
if r[
|
367
|
-
r[
|
377
|
+
if r["subject"]
|
378
|
+
r["subject"] = r["subject"].uniq.join(" and ")
|
368
379
|
end
|
369
380
|
|
370
381
|
# Note: we replace the previously set r['subject'].
|
371
|
-
r[
|
372
|
-
r[
|
382
|
+
r["subject"] = type.to_s.upcase.gsub("-", " ")
|
383
|
+
r["subject"] = "CCDS" if type == :cctf && supertitle.include?("CCDS")
|
373
384
|
|
374
385
|
r
|
375
386
|
end
|
@@ -377,11 +388,11 @@ module Bipm
|
|
377
388
|
def extract_pdf(meeting, lang)
|
378
389
|
pdfs = meeting.css('a.title-third[href*=".pdf"]')
|
379
390
|
.map { |i| i.attr("href") }
|
380
|
-
.map { |i| i.split(
|
391
|
+
.map { |i| i.split("?").first }
|
381
392
|
.select do |i|
|
382
|
-
|
383
|
-
|
384
|
-
|
393
|
+
i.downcase.include?("-#{lang}.pdf") ||
|
394
|
+
%w[en fr].none? { |l| i.downcase.include? "-#{l}.pdf" }
|
395
|
+
end
|
385
396
|
|
386
397
|
pdfs = pdfs.first if pdfs.length <= 1
|
387
398
|
|
@@ -389,8 +400,10 @@ module Bipm
|
|
389
400
|
end
|
390
401
|
|
391
402
|
def extract_date(date_str)
|
403
|
+
return nil unless date_str
|
404
|
+
|
392
405
|
date = date_str.strip
|
393
|
-
.gsub(/\s+/,
|
406
|
+
.gsub(/\s+/, " ")
|
394
407
|
.gsub("février", "february") # 3 first letters must match English
|
395
408
|
.gsub("juin", "june")
|
396
409
|
.gsub("avril", "april")
|
@@ -411,7 +424,6 @@ module Bipm
|
|
411
424
|
|
412
425
|
extend self
|
413
426
|
end
|
414
|
-
|
415
427
|
end
|
416
428
|
end
|
417
429
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bipm-data-importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-12-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -39,7 +39,7 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: coradoc
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
@@ -206,7 +206,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
206
206
|
- !ruby/object:Gem::Version
|
207
207
|
version: '0'
|
208
208
|
requirements: []
|
209
|
-
rubygems_version: 3.3.
|
209
|
+
rubygems_version: 3.3.27
|
210
210
|
signing_key:
|
211
211
|
specification_version: 4
|
212
212
|
summary: Importer for BIPM CGPM and CIPM content
|