bipm-data-importer 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bipm/data/importer/common.rb +80 -82
- data/lib/bipm/data/importer/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3bf2e9209301b13261c1b15013611cd17cf5355d6152c1f59d131aa486bbec6c
|
|
4
|
+
data.tar.gz: d1bf3ab0059a945f134068cf3c9cab045fdb6c9970edd3dc0cb2ce839c3dce9b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f653f802d49a88b5e9f0f5b6380a1c9a0fdd73e52be093cae3bc62020d9057221dc9b9264a75bf10198b154f16dd8010480056ae624f7a1c49e7659e1b87b6fc
|
|
7
|
+
data.tar.gz: dd797598112540212aa451e62089b4dde8dc89e4cbfd85e83e6c653e002e65fbab8783ca4cfaab6d8c10a4442e90cacbb6c057b02615c270bf6b7dc3f531a3e0
|
|
@@ -1,20 +1,19 @@
|
|
|
1
|
-
require
|
|
2
|
-
require
|
|
3
|
-
require
|
|
4
|
-
require
|
|
5
|
-
require
|
|
6
|
-
require
|
|
7
|
-
require_relative
|
|
1
|
+
require "mechanize"
|
|
2
|
+
require "coradoc/input/html"
|
|
3
|
+
require "vcr"
|
|
4
|
+
require "date"
|
|
5
|
+
require "fileutils"
|
|
6
|
+
require "pry"
|
|
7
|
+
require_relative "asciimath"
|
|
8
8
|
|
|
9
9
|
VCR.configure do |c|
|
|
10
|
-
c.cassette_library_dir = __dir__+
|
|
10
|
+
c.cassette_library_dir = __dir__ + "/../../../../cassettes"
|
|
11
11
|
c.hook_into :webmock
|
|
12
12
|
end
|
|
13
13
|
|
|
14
14
|
module Bipm
|
|
15
15
|
module Data
|
|
16
16
|
module Importer
|
|
17
|
-
|
|
18
17
|
CONSIDERATIONS = {
|
|
19
18
|
/(?:having(?: regard)?|ayant|concerne|vu la|agissant conformément|sachant|de porter)/i => "having / having regard",
|
|
20
19
|
/(?:noting|to note|took note|note[sd]?|taking note|takes note|constatant|constate|that|notant|notant que|note également|(?:prend|prenant) (?:acte|note))/i => "noting",
|
|
@@ -79,52 +78,52 @@ module Bipm
|
|
|
79
78
|
/(?:empowers|habilite)/i => "empowers",
|
|
80
79
|
}
|
|
81
80
|
|
|
82
|
-
PREFIX1
|
|
83
|
-
PREFIX2
|
|
84
|
-
PREFIX3
|
|
85
|
-
PREFIX4
|
|
86
|
-
PREFIX5
|
|
87
|
-
PREFIX6
|
|
81
|
+
PREFIX1 = /(?:The|Le) CIPM |La Conférence |M. Volterra |M. le Président |unanimously |would |a |sont |will |were |did not |strongly |(?:La|The) (?:\d+(?:e|th)|Quinzième) Conférence Générale des Poids et Mesures(?: a |,\s+)?/i
|
|
82
|
+
PREFIX2 = /The \d+th Conférence Générale des Poids et Mesures |The Conference |and |et (?:en |)|has |renouvelle sa |renews its |further |and further |En ce qui |après avoir |\.\.\.\n+\t*/i
|
|
83
|
+
PREFIX3 = /Sur la proposition de M. le Président, la convocation de cette Conférence de Thermométrie est |Le texte corrigé, finalement |(?:The|Le) Comité International(?: des Poids et Mesures)?(?: \(CIPM\))?(?: a |,)?\s*/i
|
|
84
|
+
PREFIX4 = /(?:The |Le |)(?:JCRB|JCGM|CCU|CCTF|CCT|CCRI|CCPR|CCQM|CCM|CCL|CCEM|CCAUV|KCDB),? (?:also |)|Each RMO |fully |The JCRB Rules of Procedure are |Bob Watters and Claudine Thomas /
|
|
85
|
+
PREFIX5 = /(?:The |Le |All |)(?:incoming |)(?:JCRB |KCDB |)(?:documents|(?:Consultative |)Committees?|Office|Chairman(?: and Secretary|)|Joint BIPM[\/-]ILAC Working Group(?: \(see Action 22\))|RMO(?:[- ]JCRB|) Representatives(?: to the JRCB|)|(?:BIPM |)Director(?: of BIPM|)|SIM|(?:Exec(?:utive|) |)Secretary(?:\(ies\)|)|RMOs, except SIM,|RMOs|APMP|\(?(?:[MD]r|Prof) [A-Z][a-zR-]+\)?|CMCs|EUR[AO]MET|COOMET|GULFMET) |It was /
|
|
86
|
+
PREFIX6 = /“|"|« à |All RMO documents related to review procedures |Mr Lam and Dr Kühne |The Prof. Kühne, Mr Jones and the Executive Secretary |Ajchara Charoensook, from APMP, /
|
|
88
87
|
|
|
89
|
-
PREFIX
|
|
88
|
+
PREFIX = /(?:#{PREFIX1}|#{PREFIX2}|#{PREFIX3}|#{PREFIX4}|#{PREFIX5}|#{PREFIX6})?/i
|
|
90
89
|
|
|
91
|
-
SUFFIX
|
|
90
|
+
SUFFIX = / (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
|
|
92
91
|
|
|
93
92
|
DOIREGEX = %r'\s+<p>\s+<b>DOI :</b> (.*?)\s+</p>\n\n'
|
|
94
93
|
|
|
95
94
|
module Common
|
|
96
|
-
def replace_links
|
|
97
|
-
ps.css(
|
|
98
|
-
href = a.attr(
|
|
95
|
+
def replace_links(ps, res, lang)
|
|
96
|
+
ps.css("a[href]").each do |a|
|
|
97
|
+
href = a.attr("href")
|
|
99
98
|
|
|
100
|
-
href = href.gsub(%r'\Ahttps://www
|
|
99
|
+
href = href.gsub(%r'\Ahttps://www\.bipm\.org/', "")
|
|
101
100
|
|
|
102
101
|
# Correct links
|
|
103
|
-
href = href.gsub(
|
|
102
|
+
href = href.gsub("/web/guest/", "/#{lang}/")
|
|
104
103
|
|
|
105
104
|
# Account for some mistakes from an upstream document
|
|
106
|
-
href = href.gsub(%r"\A/jen/",
|
|
107
|
-
href = href.gsub(%r"\A/en/CGPM/jsp/",
|
|
105
|
+
href = href.gsub(%r"\A/jen/", "/en/")
|
|
106
|
+
href = href.gsub(%r"\A/en/CGPM/jsp/", "/en/CGPM/db/")
|
|
108
107
|
|
|
109
108
|
href = case href
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
109
|
+
when %r'\A/(\w{2})/CGPM/db/(\d+)/(\d+)/(#.*)?\z',
|
|
110
|
+
%r'\A/jsp/(\w{2})/ViewCGPMResolution\.jsp\?CGPM=(\d+)&RES=(\d+)(#.*)?\z',
|
|
111
|
+
%r'\A/(\w{2})/committees/cg/cgpm/(\d+)-\d+/resolution-(\d+)(#.*)?\z',
|
|
112
|
+
"cgpm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
|
|
113
|
+
when %r'\A/(\w{2})/CIPM/db/(\d+)/(\d+)/(#.*)?\z'
|
|
114
|
+
"cipm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
|
|
115
|
+
when %r'\A/(\w{2})/committees/cipm/meeting/([0-9()I]+).html(#.*)?\z'
|
|
116
|
+
"cipm-decisions:#{$1}/#{$2}#{$3}"
|
|
117
|
+
else
|
|
118
|
+
URI(res.uri).merge(href).to_s # Relative -> absolute
|
|
119
|
+
end
|
|
121
120
|
|
|
122
|
-
a.set_attribute(
|
|
121
|
+
a.set_attribute("href", href)
|
|
123
122
|
end
|
|
124
123
|
end
|
|
125
124
|
|
|
126
|
-
def replace_centers
|
|
127
|
-
centers = ps.css(
|
|
125
|
+
def replace_centers(ps)
|
|
126
|
+
centers = ps.css("center").to_a
|
|
128
127
|
while centers.length > 0
|
|
129
128
|
center = centers.first
|
|
130
129
|
current = center
|
|
@@ -133,7 +132,7 @@ module Bipm
|
|
|
133
132
|
break unless current.next
|
|
134
133
|
while Nokogiri::XML::Text === current.next
|
|
135
134
|
current = current.next
|
|
136
|
-
break if current.text.strip !=
|
|
135
|
+
break if current.text.strip != ""
|
|
137
136
|
end
|
|
138
137
|
break unless current.next
|
|
139
138
|
break unless current.next.name == "center"
|
|
@@ -159,14 +158,14 @@ module Bipm
|
|
|
159
158
|
end
|
|
160
159
|
|
|
161
160
|
# Remove the remaining centers
|
|
162
|
-
ps.css(
|
|
161
|
+
ps.css("center").each do |i|
|
|
163
162
|
i.replace i.inner_html
|
|
164
163
|
end
|
|
165
164
|
end
|
|
166
165
|
|
|
167
|
-
def format_message
|
|
166
|
+
def format_message(part)
|
|
168
167
|
AsciiMath.asciidoc_extract_math(
|
|
169
|
-
Coradoc::Input::HTML.convert(part).strip.gsub(" ",
|
|
168
|
+
Coradoc::Input::HTML.convert(part).strip.gsub(" ", " ").gsub(" \n", "\n")
|
|
170
169
|
)
|
|
171
170
|
rescue
|
|
172
171
|
warn "Bug in Coradoc, couldn't parse the following document:"
|
|
@@ -175,36 +174,36 @@ module Bipm
|
|
|
175
174
|
raise
|
|
176
175
|
end
|
|
177
176
|
|
|
178
|
-
def ng_to_string
|
|
179
|
-
ps.inner_html.encode(
|
|
177
|
+
def ng_to_string(ps)
|
|
178
|
+
ps.inner_html.encode("utf-8").gsub("\r", "").gsub(%r'</?nobr>', "")
|
|
180
179
|
end
|
|
181
180
|
|
|
182
|
-
def parse_resolution
|
|
181
|
+
def parse_resolution(res, res_id, date, type = :cgpm, lang = "en", rec_type = nil)
|
|
183
182
|
# Reparse the document after fixing upstream syntax
|
|
184
183
|
fixed_body = res.body.gsub("<name=", "<a name=")
|
|
185
|
-
fixed_body = fixed_body.force_encoding(
|
|
186
|
-
fixed_body = fixed_body.gsub(
|
|
187
|
-
fixed_body = fixed_body.gsub(
|
|
188
|
-
fixed_body = fixed_body.gsub(
|
|
189
|
-
fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m,
|
|
184
|
+
fixed_body = fixed_body.force_encoding("utf-8")
|
|
185
|
+
fixed_body = fixed_body.gsub("É", "É")
|
|
186
|
+
fixed_body = fixed_body.gsub("« ", "« ")
|
|
187
|
+
fixed_body = fixed_body.gsub(" »", " »")
|
|
188
|
+
fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m, "")
|
|
190
189
|
supertitle = $1.strip
|
|
191
|
-
fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m,
|
|
190
|
+
fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m, "")
|
|
192
191
|
title = $1.strip
|
|
193
192
|
fixed_body = fixed_body.sub(/(="web-content">)\s*<p>\s*(<p)/, '\1\2')
|
|
194
193
|
fixed_body = fixed_body.gsub(%r"<a name=\"haut\">(.*?)</a>"m, '\1')
|
|
195
194
|
ng = Nokogiri::HTML(fixed_body, res.uri.to_s, "utf-8", Nokogiri::XML::ParseOptions.new.default_html.noent)
|
|
196
195
|
|
|
197
|
-
refs = ng.css(
|
|
196
|
+
refs = ng.css(".publication-card_reference a")
|
|
198
197
|
|
|
199
198
|
if rec_type.end_with? "?"
|
|
200
199
|
rec_type = case supertitle
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
200
|
+
when /\AD[eé]claration/
|
|
201
|
+
"statement"
|
|
202
|
+
when /\AR[eé]solution/
|
|
203
|
+
"resolution"
|
|
204
|
+
else
|
|
205
|
+
rec_type[..-2]
|
|
206
|
+
end
|
|
208
207
|
end
|
|
209
208
|
|
|
210
209
|
r = {
|
|
@@ -227,7 +226,7 @@ module Bipm
|
|
|
227
226
|
r.delete("type") unless r["type"]
|
|
228
227
|
|
|
229
228
|
if refs.length > 0
|
|
230
|
-
r["reference"] = res.uri.merge(refs.first.attr(
|
|
229
|
+
r["reference"] = res.uri.merge(refs.first.attr("href")).to_s.split("?").first
|
|
231
230
|
name, page = refs.first.text.strip.split(/, p(?=[0-9])/)
|
|
232
231
|
r["reference_name"] = name
|
|
233
232
|
if page
|
|
@@ -241,7 +240,7 @@ module Bipm
|
|
|
241
240
|
r.delete("reference_page")
|
|
242
241
|
end
|
|
243
242
|
|
|
244
|
-
ps = ng.css(
|
|
243
|
+
ps = ng.css("div.journal-content-article").first
|
|
245
244
|
|
|
246
245
|
#binding.pry if ps.count != 1
|
|
247
246
|
|
|
@@ -255,7 +254,7 @@ module Bipm
|
|
|
255
254
|
# doc = AsciiMath.html_to_asciimath(doc)
|
|
256
255
|
|
|
257
256
|
if doc.match? DOIREGEX
|
|
258
|
-
doc = doc.sub(DOIREGEX,
|
|
257
|
+
doc = doc.sub(DOIREGEX, "")
|
|
259
258
|
r["doi"] = $1
|
|
260
259
|
end
|
|
261
260
|
|
|
@@ -279,13 +278,13 @@ module Bipm
|
|
|
279
278
|
next
|
|
280
279
|
end
|
|
281
280
|
|
|
282
|
-
if parse.start_with?
|
|
283
|
-
part = part.sub(
|
|
281
|
+
if parse.start_with? "NOTE"
|
|
282
|
+
part = part.sub("<h3>NOTE</h3>", "")
|
|
284
283
|
r["notes"] = Common.format_message(part)
|
|
285
284
|
next
|
|
286
285
|
end
|
|
287
286
|
|
|
288
|
-
CONSIDERATIONS.any? do |k,v|
|
|
287
|
+
CONSIDERATIONS.any? do |k, v|
|
|
289
288
|
if parse =~ /\A#{PREFIX}#{k}\b/i
|
|
290
289
|
r["considerations"] << prev = {
|
|
291
290
|
"type" => v,
|
|
@@ -295,7 +294,7 @@ module Bipm
|
|
|
295
294
|
end
|
|
296
295
|
end && next
|
|
297
296
|
|
|
298
|
-
ACTIONS.any? do |k,v|
|
|
297
|
+
ACTIONS.any? do |k, v|
|
|
299
298
|
if parse =~ /\A#{PREFIX}#{k}\b/i
|
|
300
299
|
r["actions"] << prev = {
|
|
301
300
|
"type" => v,
|
|
@@ -326,13 +325,13 @@ module Bipm
|
|
|
326
325
|
end
|
|
327
326
|
|
|
328
327
|
%w[considerations actions].each do |type|
|
|
329
|
-
map = type ==
|
|
328
|
+
map = type == "actions" ? ACTIONS : CONSIDERATIONS
|
|
330
329
|
r[type] = r[type].map do |i|
|
|
331
330
|
islist = false
|
|
332
331
|
|
|
333
332
|
kk = nil
|
|
334
333
|
|
|
335
|
-
if map.any? { |k,v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
|
|
334
|
+
if map.any? { |k, v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
|
|
336
335
|
prefix = $2
|
|
337
336
|
suffix = $3
|
|
338
337
|
subject = $4
|
|
@@ -359,15 +358,15 @@ module Bipm
|
|
|
359
358
|
|
|
360
359
|
if subject
|
|
361
360
|
#p subject
|
|
362
|
-
r[
|
|
363
|
-
r[
|
|
361
|
+
r["subject"] ||= []
|
|
362
|
+
r["subject"] << subject
|
|
364
363
|
end
|
|
365
364
|
|
|
366
365
|
if islist
|
|
367
366
|
suffix = suffix.strip
|
|
368
|
-
suffix = nil if suffix ==
|
|
367
|
+
suffix = nil if suffix == ""
|
|
369
368
|
listitems.map do |li|
|
|
370
|
-
i.merge
|
|
369
|
+
i.merge "message" => [prefix, suffix, li].compact.join(" ")
|
|
371
370
|
end
|
|
372
371
|
else
|
|
373
372
|
i
|
|
@@ -375,13 +374,13 @@ module Bipm
|
|
|
375
374
|
end.flatten
|
|
376
375
|
end
|
|
377
376
|
|
|
378
|
-
if r[
|
|
379
|
-
r[
|
|
377
|
+
if r["subject"]
|
|
378
|
+
r["subject"] = r["subject"].uniq.join(" and ")
|
|
380
379
|
end
|
|
381
380
|
|
|
382
381
|
# Note: we replace the previously set r['subject'].
|
|
383
|
-
r[
|
|
384
|
-
r[
|
|
382
|
+
r["subject"] = type.to_s.upcase.gsub("-", " ")
|
|
383
|
+
r["subject"] = "CCDS" if type == :cctf && supertitle.include?("CCDS")
|
|
385
384
|
|
|
386
385
|
r
|
|
387
386
|
end
|
|
@@ -389,11 +388,11 @@ module Bipm
|
|
|
389
388
|
def extract_pdf(meeting, lang)
|
|
390
389
|
pdfs = meeting.css('a.title-third[href*=".pdf"]')
|
|
391
390
|
.map { |i| i.attr("href") }
|
|
392
|
-
.map { |i| i.split(
|
|
391
|
+
.map { |i| i.split("?").first }
|
|
393
392
|
.select do |i|
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
393
|
+
i.downcase.include?("-#{lang}.pdf") ||
|
|
394
|
+
%w[en fr].none? { |l| i.downcase.include? "-#{l}.pdf" }
|
|
395
|
+
end
|
|
397
396
|
|
|
398
397
|
pdfs = pdfs.first if pdfs.length <= 1
|
|
399
398
|
|
|
@@ -404,7 +403,7 @@ module Bipm
|
|
|
404
403
|
return nil unless date_str
|
|
405
404
|
|
|
406
405
|
date = date_str.strip
|
|
407
|
-
.gsub(/\s+/,
|
|
406
|
+
.gsub(/\s+/, " ")
|
|
408
407
|
.gsub("février", "february") # 3 first letters must match English
|
|
409
408
|
.gsub("juin", "june")
|
|
410
409
|
.gsub("avril", "april")
|
|
@@ -425,7 +424,6 @@ module Bipm
|
|
|
425
424
|
|
|
426
425
|
extend self
|
|
427
426
|
end
|
|
428
|
-
|
|
429
427
|
end
|
|
430
428
|
end
|
|
431
429
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: bipm-data-importer
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.
|
|
4
|
+
version: 0.2.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2024-12-
|
|
11
|
+
date: 2024-12-08 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: nokogiri
|