bipm-data-importer 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/bipm/data/importer/common.rb +80 -82
- data/lib/bipm/data/importer/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3bf2e9209301b13261c1b15013611cd17cf5355d6152c1f59d131aa486bbec6c
|
4
|
+
data.tar.gz: d1bf3ab0059a945f134068cf3c9cab045fdb6c9970edd3dc0cb2ce839c3dce9b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f653f802d49a88b5e9f0f5b6380a1c9a0fdd73e52be093cae3bc62020d9057221dc9b9264a75bf10198b154f16dd8010480056ae624f7a1c49e7659e1b87b6fc
|
7
|
+
data.tar.gz: dd797598112540212aa451e62089b4dde8dc89e4cbfd85e83e6c653e002e65fbab8783ca4cfaab6d8c10a4442e90cacbb6c057b02615c270bf6b7dc3f531a3e0
|
@@ -1,20 +1,19 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require_relative
|
1
|
+
require "mechanize"
|
2
|
+
require "coradoc/input/html"
|
3
|
+
require "vcr"
|
4
|
+
require "date"
|
5
|
+
require "fileutils"
|
6
|
+
require "pry"
|
7
|
+
require_relative "asciimath"
|
8
8
|
|
9
9
|
VCR.configure do |c|
|
10
|
-
c.cassette_library_dir = __dir__+
|
10
|
+
c.cassette_library_dir = __dir__ + "/../../../../cassettes"
|
11
11
|
c.hook_into :webmock
|
12
12
|
end
|
13
13
|
|
14
14
|
module Bipm
|
15
15
|
module Data
|
16
16
|
module Importer
|
17
|
-
|
18
17
|
CONSIDERATIONS = {
|
19
18
|
/(?:having(?: regard)?|ayant|concerne|vu la|agissant conformément|sachant|de porter)/i => "having / having regard",
|
20
19
|
/(?:noting|to note|took note|note[sd]?|taking note|takes note|constatant|constate|that|notant|notant que|note également|(?:prend|prenant) (?:acte|note))/i => "noting",
|
@@ -79,52 +78,52 @@ module Bipm
|
|
79
78
|
/(?:empowers|habilite)/i => "empowers",
|
80
79
|
}
|
81
80
|
|
82
|
-
PREFIX1
|
83
|
-
PREFIX2
|
84
|
-
PREFIX3
|
85
|
-
PREFIX4
|
86
|
-
PREFIX5
|
87
|
-
PREFIX6
|
81
|
+
PREFIX1 = /(?:The|Le) CIPM |La Conférence |M. Volterra |M. le Président |unanimously |would |a |sont |will |were |did not |strongly |(?:La|The) (?:\d+(?:e|th)|Quinzième) Conférence Générale des Poids et Mesures(?: a |,\s+)?/i
|
82
|
+
PREFIX2 = /The \d+th Conférence Générale des Poids et Mesures |The Conference |and |et (?:en |)|has |renouvelle sa |renews its |further |and further |En ce qui |après avoir |\.\.\.\n+\t*/i
|
83
|
+
PREFIX3 = /Sur la proposition de M. le Président, la convocation de cette Conférence de Thermométrie est |Le texte corrigé, finalement |(?:The|Le) Comité International(?: des Poids et Mesures)?(?: \(CIPM\))?(?: a |,)?\s*/i
|
84
|
+
PREFIX4 = /(?:The |Le |)(?:JCRB|JCGM|CCU|CCTF|CCT|CCRI|CCPR|CCQM|CCM|CCL|CCEM|CCAUV|KCDB),? (?:also |)|Each RMO |fully |The JCRB Rules of Procedure are |Bob Watters and Claudine Thomas /
|
85
|
+
PREFIX5 = /(?:The |Le |All |)(?:incoming |)(?:JCRB |KCDB |)(?:documents|(?:Consultative |)Committees?|Office|Chairman(?: and Secretary|)|Joint BIPM[\/-]ILAC Working Group(?: \(see Action 22\))|RMO(?:[- ]JCRB|) Representatives(?: to the JRCB|)|(?:BIPM |)Director(?: of BIPM|)|SIM|(?:Exec(?:utive|) |)Secretary(?:\(ies\)|)|RMOs, except SIM,|RMOs|APMP|\(?(?:[MD]r|Prof) [A-Z][a-zR-]+\)?|CMCs|EUR[AO]MET|COOMET|GULFMET) |It was /
|
86
|
+
PREFIX6 = /“|"|« à |All RMO documents related to review procedures |Mr Lam and Dr Kühne |The Prof. Kühne, Mr Jones and the Executive Secretary |Ajchara Charoensook, from APMP, /
|
88
87
|
|
89
|
-
PREFIX
|
88
|
+
PREFIX = /(?:#{PREFIX1}|#{PREFIX2}|#{PREFIX3}|#{PREFIX4}|#{PREFIX5}|#{PREFIX6})?/i
|
90
89
|
|
91
|
-
SUFFIX
|
90
|
+
SUFFIX = / (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
|
92
91
|
|
93
92
|
DOIREGEX = %r'\s+<p>\s+<b>DOI :</b> (.*?)\s+</p>\n\n'
|
94
93
|
|
95
94
|
module Common
|
96
|
-
def replace_links
|
97
|
-
ps.css(
|
98
|
-
href = a.attr(
|
95
|
+
def replace_links(ps, res, lang)
|
96
|
+
ps.css("a[href]").each do |a|
|
97
|
+
href = a.attr("href")
|
99
98
|
|
100
|
-
href = href.gsub(%r'\Ahttps://www
|
99
|
+
href = href.gsub(%r'\Ahttps://www\.bipm\.org/', "")
|
101
100
|
|
102
101
|
# Correct links
|
103
|
-
href = href.gsub(
|
102
|
+
href = href.gsub("/web/guest/", "/#{lang}/")
|
104
103
|
|
105
104
|
# Account for some mistakes from an upstream document
|
106
|
-
href = href.gsub(%r"\A/jen/",
|
107
|
-
href = href.gsub(%r"\A/en/CGPM/jsp/",
|
105
|
+
href = href.gsub(%r"\A/jen/", "/en/")
|
106
|
+
href = href.gsub(%r"\A/en/CGPM/jsp/", "/en/CGPM/db/")
|
108
107
|
|
109
108
|
href = case href
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
109
|
+
when %r'\A/(\w{2})/CGPM/db/(\d+)/(\d+)/(#.*)?\z',
|
110
|
+
%r'\A/jsp/(\w{2})/ViewCGPMResolution\.jsp\?CGPM=(\d+)&RES=(\d+)(#.*)?\z',
|
111
|
+
%r'\A/(\w{2})/committees/cg/cgpm/(\d+)-\d+/resolution-(\d+)(#.*)?\z',
|
112
|
+
"cgpm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
|
113
|
+
when %r'\A/(\w{2})/CIPM/db/(\d+)/(\d+)/(#.*)?\z'
|
114
|
+
"cipm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
|
115
|
+
when %r'\A/(\w{2})/committees/cipm/meeting/([0-9()I]+).html(#.*)?\z'
|
116
|
+
"cipm-decisions:#{$1}/#{$2}#{$3}"
|
117
|
+
else
|
118
|
+
URI(res.uri).merge(href).to_s # Relative -> absolute
|
119
|
+
end
|
121
120
|
|
122
|
-
a.set_attribute(
|
121
|
+
a.set_attribute("href", href)
|
123
122
|
end
|
124
123
|
end
|
125
124
|
|
126
|
-
def replace_centers
|
127
|
-
centers = ps.css(
|
125
|
+
def replace_centers(ps)
|
126
|
+
centers = ps.css("center").to_a
|
128
127
|
while centers.length > 0
|
129
128
|
center = centers.first
|
130
129
|
current = center
|
@@ -133,7 +132,7 @@ module Bipm
|
|
133
132
|
break unless current.next
|
134
133
|
while Nokogiri::XML::Text === current.next
|
135
134
|
current = current.next
|
136
|
-
break if current.text.strip !=
|
135
|
+
break if current.text.strip != ""
|
137
136
|
end
|
138
137
|
break unless current.next
|
139
138
|
break unless current.next.name == "center"
|
@@ -159,14 +158,14 @@ module Bipm
|
|
159
158
|
end
|
160
159
|
|
161
160
|
# Remove the remaining centers
|
162
|
-
ps.css(
|
161
|
+
ps.css("center").each do |i|
|
163
162
|
i.replace i.inner_html
|
164
163
|
end
|
165
164
|
end
|
166
165
|
|
167
|
-
def format_message
|
166
|
+
def format_message(part)
|
168
167
|
AsciiMath.asciidoc_extract_math(
|
169
|
-
Coradoc::Input::HTML.convert(part).strip.gsub(" ",
|
168
|
+
Coradoc::Input::HTML.convert(part).strip.gsub(" ", " ").gsub(" \n", "\n")
|
170
169
|
)
|
171
170
|
rescue
|
172
171
|
warn "Bug in Coradoc, couldn't parse the following document:"
|
@@ -175,36 +174,36 @@ module Bipm
|
|
175
174
|
raise
|
176
175
|
end
|
177
176
|
|
178
|
-
def ng_to_string
|
179
|
-
ps.inner_html.encode(
|
177
|
+
def ng_to_string(ps)
|
178
|
+
ps.inner_html.encode("utf-8").gsub("\r", "").gsub(%r'</?nobr>', "")
|
180
179
|
end
|
181
180
|
|
182
|
-
def parse_resolution
|
181
|
+
def parse_resolution(res, res_id, date, type = :cgpm, lang = "en", rec_type = nil)
|
183
182
|
# Reparse the document after fixing upstream syntax
|
184
183
|
fixed_body = res.body.gsub("<name=", "<a name=")
|
185
|
-
fixed_body = fixed_body.force_encoding(
|
186
|
-
fixed_body = fixed_body.gsub(
|
187
|
-
fixed_body = fixed_body.gsub(
|
188
|
-
fixed_body = fixed_body.gsub(
|
189
|
-
fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m,
|
184
|
+
fixed_body = fixed_body.force_encoding("utf-8")
|
185
|
+
fixed_body = fixed_body.gsub("É", "É")
|
186
|
+
fixed_body = fixed_body.gsub("« ", "« ")
|
187
|
+
fixed_body = fixed_body.gsub(" »", " »")
|
188
|
+
fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m, "")
|
190
189
|
supertitle = $1.strip
|
191
|
-
fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m,
|
190
|
+
fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m, "")
|
192
191
|
title = $1.strip
|
193
192
|
fixed_body = fixed_body.sub(/(="web-content">)\s*<p>\s*(<p)/, '\1\2')
|
194
193
|
fixed_body = fixed_body.gsub(%r"<a name=\"haut\">(.*?)</a>"m, '\1')
|
195
194
|
ng = Nokogiri::HTML(fixed_body, res.uri.to_s, "utf-8", Nokogiri::XML::ParseOptions.new.default_html.noent)
|
196
195
|
|
197
|
-
refs = ng.css(
|
196
|
+
refs = ng.css(".publication-card_reference a")
|
198
197
|
|
199
198
|
if rec_type.end_with? "?"
|
200
199
|
rec_type = case supertitle
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
200
|
+
when /\AD[eé]claration/
|
201
|
+
"statement"
|
202
|
+
when /\AR[eé]solution/
|
203
|
+
"resolution"
|
204
|
+
else
|
205
|
+
rec_type[..-2]
|
206
|
+
end
|
208
207
|
end
|
209
208
|
|
210
209
|
r = {
|
@@ -227,7 +226,7 @@ module Bipm
|
|
227
226
|
r.delete("type") unless r["type"]
|
228
227
|
|
229
228
|
if refs.length > 0
|
230
|
-
r["reference"] = res.uri.merge(refs.first.attr(
|
229
|
+
r["reference"] = res.uri.merge(refs.first.attr("href")).to_s.split("?").first
|
231
230
|
name, page = refs.first.text.strip.split(/, p(?=[0-9])/)
|
232
231
|
r["reference_name"] = name
|
233
232
|
if page
|
@@ -241,7 +240,7 @@ module Bipm
|
|
241
240
|
r.delete("reference_page")
|
242
241
|
end
|
243
242
|
|
244
|
-
ps = ng.css(
|
243
|
+
ps = ng.css("div.journal-content-article").first
|
245
244
|
|
246
245
|
#binding.pry if ps.count != 1
|
247
246
|
|
@@ -255,7 +254,7 @@ module Bipm
|
|
255
254
|
# doc = AsciiMath.html_to_asciimath(doc)
|
256
255
|
|
257
256
|
if doc.match? DOIREGEX
|
258
|
-
doc = doc.sub(DOIREGEX,
|
257
|
+
doc = doc.sub(DOIREGEX, "")
|
259
258
|
r["doi"] = $1
|
260
259
|
end
|
261
260
|
|
@@ -279,13 +278,13 @@ module Bipm
|
|
279
278
|
next
|
280
279
|
end
|
281
280
|
|
282
|
-
if parse.start_with?
|
283
|
-
part = part.sub(
|
281
|
+
if parse.start_with? "NOTE"
|
282
|
+
part = part.sub("<h3>NOTE</h3>", "")
|
284
283
|
r["notes"] = Common.format_message(part)
|
285
284
|
next
|
286
285
|
end
|
287
286
|
|
288
|
-
CONSIDERATIONS.any? do |k,v|
|
287
|
+
CONSIDERATIONS.any? do |k, v|
|
289
288
|
if parse =~ /\A#{PREFIX}#{k}\b/i
|
290
289
|
r["considerations"] << prev = {
|
291
290
|
"type" => v,
|
@@ -295,7 +294,7 @@ module Bipm
|
|
295
294
|
end
|
296
295
|
end && next
|
297
296
|
|
298
|
-
ACTIONS.any? do |k,v|
|
297
|
+
ACTIONS.any? do |k, v|
|
299
298
|
if parse =~ /\A#{PREFIX}#{k}\b/i
|
300
299
|
r["actions"] << prev = {
|
301
300
|
"type" => v,
|
@@ -326,13 +325,13 @@ module Bipm
|
|
326
325
|
end
|
327
326
|
|
328
327
|
%w[considerations actions].each do |type|
|
329
|
-
map = type ==
|
328
|
+
map = type == "actions" ? ACTIONS : CONSIDERATIONS
|
330
329
|
r[type] = r[type].map do |i|
|
331
330
|
islist = false
|
332
331
|
|
333
332
|
kk = nil
|
334
333
|
|
335
|
-
if map.any? { |k,v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
|
334
|
+
if map.any? { |k, v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
|
336
335
|
prefix = $2
|
337
336
|
suffix = $3
|
338
337
|
subject = $4
|
@@ -359,15 +358,15 @@ module Bipm
|
|
359
358
|
|
360
359
|
if subject
|
361
360
|
#p subject
|
362
|
-
r[
|
363
|
-
r[
|
361
|
+
r["subject"] ||= []
|
362
|
+
r["subject"] << subject
|
364
363
|
end
|
365
364
|
|
366
365
|
if islist
|
367
366
|
suffix = suffix.strip
|
368
|
-
suffix = nil if suffix ==
|
367
|
+
suffix = nil if suffix == ""
|
369
368
|
listitems.map do |li|
|
370
|
-
i.merge
|
369
|
+
i.merge "message" => [prefix, suffix, li].compact.join(" ")
|
371
370
|
end
|
372
371
|
else
|
373
372
|
i
|
@@ -375,13 +374,13 @@ module Bipm
|
|
375
374
|
end.flatten
|
376
375
|
end
|
377
376
|
|
378
|
-
if r[
|
379
|
-
r[
|
377
|
+
if r["subject"]
|
378
|
+
r["subject"] = r["subject"].uniq.join(" and ")
|
380
379
|
end
|
381
380
|
|
382
381
|
# Note: we replace the previously set r['subject'].
|
383
|
-
r[
|
384
|
-
r[
|
382
|
+
r["subject"] = type.to_s.upcase.gsub("-", " ")
|
383
|
+
r["subject"] = "CCDS" if type == :cctf && supertitle.include?("CCDS")
|
385
384
|
|
386
385
|
r
|
387
386
|
end
|
@@ -389,11 +388,11 @@ module Bipm
|
|
389
388
|
def extract_pdf(meeting, lang)
|
390
389
|
pdfs = meeting.css('a.title-third[href*=".pdf"]')
|
391
390
|
.map { |i| i.attr("href") }
|
392
|
-
.map { |i| i.split(
|
391
|
+
.map { |i| i.split("?").first }
|
393
392
|
.select do |i|
|
394
|
-
|
395
|
-
|
396
|
-
|
393
|
+
i.downcase.include?("-#{lang}.pdf") ||
|
394
|
+
%w[en fr].none? { |l| i.downcase.include? "-#{l}.pdf" }
|
395
|
+
end
|
397
396
|
|
398
397
|
pdfs = pdfs.first if pdfs.length <= 1
|
399
398
|
|
@@ -404,7 +403,7 @@ module Bipm
|
|
404
403
|
return nil unless date_str
|
405
404
|
|
406
405
|
date = date_str.strip
|
407
|
-
.gsub(/\s+/,
|
406
|
+
.gsub(/\s+/, " ")
|
408
407
|
.gsub("février", "february") # 3 first letters must match English
|
409
408
|
.gsub("juin", "june")
|
410
409
|
.gsub("avril", "april")
|
@@ -425,7 +424,6 @@ module Bipm
|
|
425
424
|
|
426
425
|
extend self
|
427
426
|
end
|
428
|
-
|
429
427
|
end
|
430
428
|
end
|
431
429
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bipm-data-importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-12-
|
11
|
+
date: 2024-12-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|