bipm-data-importer 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3757c28f685035d3af9283b248cc57d523e9735789fe51753622d72962db84a0
4
- data.tar.gz: 874b08f81363f1d27802e87550be8d73aae99cc5156d0b2f0df92b6b055ee780
3
+ metadata.gz: 3bf2e9209301b13261c1b15013611cd17cf5355d6152c1f59d131aa486bbec6c
4
+ data.tar.gz: d1bf3ab0059a945f134068cf3c9cab045fdb6c9970edd3dc0cb2ce839c3dce9b
5
5
  SHA512:
6
- metadata.gz: c394b46052dc8d7dac1f15352176280573ef71c5cb5a72b11e8204499f1f7d7cfb23ac9b09edea208aa7ea3f48eb61f890affb518d3eaf6d79a30879b29fd9a5
7
- data.tar.gz: f0eb33717e70a707e70c491c7c676a5e4b1769aec67b873bdfa2d57317d3c3bc17a7194c8ade7008a58aa8a296e038cddaad9d631490efdeb4bdf61655e9e2e4
6
+ metadata.gz: f653f802d49a88b5e9f0f5b6380a1c9a0fdd73e52be093cae3bc62020d9057221dc9b9264a75bf10198b154f16dd8010480056ae624f7a1c49e7659e1b87b6fc
7
+ data.tar.gz: dd797598112540212aa451e62089b4dde8dc89e4cbfd85e83e6c653e002e65fbab8783ca4cfaab6d8c10a4442e90cacbb6c057b02615c270bf6b7dc3f531a3e0
@@ -1,20 +1,19 @@
1
- require 'mechanize'
2
- require 'coradoc/input/html'
3
- require 'vcr'
4
- require 'date'
5
- require 'fileutils'
6
- require 'pry'
7
- require_relative 'asciimath'
1
+ require "mechanize"
2
+ require "coradoc/input/html"
3
+ require "vcr"
4
+ require "date"
5
+ require "fileutils"
6
+ require "pry"
7
+ require_relative "asciimath"
8
8
 
9
9
  VCR.configure do |c|
10
- c.cassette_library_dir = __dir__+'/../../../../cassettes'
10
+ c.cassette_library_dir = __dir__ + "/../../../../cassettes"
11
11
  c.hook_into :webmock
12
12
  end
13
13
 
14
14
  module Bipm
15
15
  module Data
16
16
  module Importer
17
-
18
17
  CONSIDERATIONS = {
19
18
  /(?:having(?: regard)?|ayant|concerne|vu la|agissant conformément|sachant|de porter)/i => "having / having regard",
20
19
  /(?:noting|to note|took note|note[sd]?|taking note|takes note|constatant|constate|that|notant|notant que|note également|(?:prend|prenant) (?:acte|note))/i => "noting",
@@ -79,52 +78,52 @@ module Bipm
79
78
  /(?:empowers|habilite)/i => "empowers",
80
79
  }
81
80
 
82
- PREFIX1=/(?:The|Le) CIPM |La Conférence |M. Volterra |M. le Président |unanimously |would |a |sont |will |were |did not |strongly |(?:La|The) (?:\d+(?:e|th)|Quinzième) Conférence Générale des Poids et Mesures(?: a |,\s+)?/i
83
- PREFIX2=/The \d+th Conférence Générale des Poids et Mesures |The Conference |and |et (?:en |)|has |renouvelle sa |renews its |further |and further |En ce qui |après avoir |\.\.\.\n+\t*/i
84
- PREFIX3=/Sur la proposition de M. le Président, la convocation de cette Conférence de Thermométrie est |Le texte corrigé, finalement |(?:The|Le) Comité International(?: des Poids et Mesures)?(?: \(CIPM\))?(?: a |,)?\s*/i
85
- PREFIX4=/(?:The |Le |)(?:JCRB|JCGM|CCU|CCTF|CCT|CCRI|CCPR|CCQM|CCM|CCL|CCEM|CCAUV|KCDB),? (?:also |)|Each RMO |fully |The JCRB Rules of Procedure are |Bob Watters and Claudine Thomas /
86
- PREFIX5=/(?:The |Le |All |)(?:incoming |)(?:JCRB |KCDB |)(?:documents|(?:Consultative |)Committees?|Office|Chairman(?: and Secretary|)|Joint BIPM[\/-]ILAC Working Group(?: \(see Action 22\))|RMO(?:[- ]JCRB|) Representatives(?: to the JRCB|)|(?:BIPM |)Director(?: of BIPM|)|SIM|(?:Exec(?:utive|) |)Secretary(?:\(ies\)|)|RMOs, except SIM,|RMOs|APMP|\(?(?:[MD]r|Prof) [A-Z][a-zR-]+\)?|CMCs|EUR[AO]MET|COOMET|GULFMET) |It was /
87
- PREFIX6=/“|"|« à |All RMO documents related to review procedures |Mr Lam and Dr Kühne |The Prof. Kühne, Mr Jones and the Executive Secretary |Ajchara Charoensook, from APMP, /
81
+ PREFIX1 = /(?:The|Le) CIPM |La Conférence |M. Volterra |M. le Président |unanimously |would |a |sont |will |were |did not |strongly |(?:La|The) (?:\d+(?:e|th)|Quinzième) Conférence Générale des Poids et Mesures(?: a |,\s+)?/i
82
+ PREFIX2 = /The \d+th Conférence Générale des Poids et Mesures |The Conference |and |et (?:en |)|has |renouvelle sa |renews its |further |and further |En ce qui |après avoir |\.\.\.\n+\t*/i
83
+ PREFIX3 = /Sur la proposition de M. le Président, la convocation de cette Conférence de Thermométrie est |Le texte corrigé, finalement |(?:The|Le) Comité International(?: des Poids et Mesures)?(?: \(CIPM\))?(?: a |,)?\s*/i
84
+ PREFIX4 = /(?:The |Le |)(?:JCRB|JCGM|CCU|CCTF|CCT|CCRI|CCPR|CCQM|CCM|CCL|CCEM|CCAUV|KCDB),? (?:also |)|Each RMO |fully |The JCRB Rules of Procedure are |Bob Watters and Claudine Thomas /
85
+ PREFIX5 = /(?:The |Le |All |)(?:incoming |)(?:JCRB |KCDB |)(?:documents|(?:Consultative |)Committees?|Office|Chairman(?: and Secretary|)|Joint BIPM[\/-]ILAC Working Group(?: \(see Action 22\))|RMO(?:[- ]JCRB|) Representatives(?: to the JRCB|)|(?:BIPM |)Director(?: of BIPM|)|SIM|(?:Exec(?:utive|) |)Secretary(?:\(ies\)|)|RMOs, except SIM,|RMOs|APMP|\(?(?:[MD]r|Prof) [A-Z][a-zR-]+\)?|CMCs|EUR[AO]MET|COOMET|GULFMET) |It was /
86
+ PREFIX6 = /“|"|« à |All RMO documents related to review procedures |Mr Lam and Dr Kühne |The Prof. Kühne, Mr Jones and the Executive Secretary |Ajchara Charoensook, from APMP, /
88
87
 
89
- PREFIX=/(?:#{PREFIX1}|#{PREFIX2}|#{PREFIX3}|#{PREFIX4}|#{PREFIX5}|#{PREFIX6})?/i
88
+ PREFIX = /(?:#{PREFIX1}|#{PREFIX2}|#{PREFIX3}|#{PREFIX4}|#{PREFIX5}|#{PREFIX6})?/i
90
89
 
91
- SUFFIX=/ (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
90
+ SUFFIX = / (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
92
91
 
93
92
  DOIREGEX = %r'\s+<p>\s+<b>DOI :</b> (.*?)\s+</p>\n\n'
94
93
 
95
94
  module Common
96
- def replace_links ps, res, lang
97
- ps.css('a[href]').each do |a|
98
- href = a.attr('href')
95
+ def replace_links(ps, res, lang)
96
+ ps.css("a[href]").each do |a|
97
+ href = a.attr("href")
99
98
 
100
- href = href.gsub(%r'\Ahttps://www.bipm.org/', '')
99
+ href = href.gsub(%r'\Ahttps://www\.bipm\.org/', "")
101
100
 
102
101
  # Correct links
103
- href = href.gsub('/web/guest/', "/#{lang}/")
102
+ href = href.gsub("/web/guest/", "/#{lang}/")
104
103
 
105
104
  # Account for some mistakes from an upstream document
106
- href = href.gsub(%r"\A/jen/", '/en/')
107
- href = href.gsub(%r"\A/en/CGPM/jsp/", '/en/CGPM/db/')
105
+ href = href.gsub(%r"\A/jen/", "/en/")
106
+ href = href.gsub(%r"\A/en/CGPM/jsp/", "/en/CGPM/db/")
108
107
 
109
108
  href = case href
110
- when %r'\A/(\w{2})/CGPM/db/(\d+)/(\d+)/(#.*)?\z',
111
- %r'\A/jsp/(\w{2})/ViewCGPMResolution\.jsp\?CGPM=(\d+)&RES=(\d+)(#.*)?\z',
112
- %r'\A/(\w{2})/committees/cg/cgpm/(\d+)-\d+/resolution-(\d+)(#.*)?\z',
113
- "cgpm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
114
- when %r'\A/(\w{2})/CIPM/db/(\d+)/(\d+)/(#.*)?\z'
115
- "cipm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
116
- when %r'\A/(\w{2})/committees/cipm/meeting/([0-9()I]+).html(#.*)?\z'
117
- "cipm-decisions:#{$1}/#{$2}#{$3}"
118
- else
119
- URI(res.uri).merge(href).to_s # Relative -> absolute
120
- end
109
+ when %r'\A/(\w{2})/CGPM/db/(\d+)/(\d+)/(#.*)?\z',
110
+ %r'\A/jsp/(\w{2})/ViewCGPMResolution\.jsp\?CGPM=(\d+)&RES=(\d+)(#.*)?\z',
111
+ %r'\A/(\w{2})/committees/cg/cgpm/(\d+)-\d+/resolution-(\d+)(#.*)?\z',
112
+ "cgpm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
113
+ when %r'\A/(\w{2})/CIPM/db/(\d+)/(\d+)/(#.*)?\z'
114
+ "cipm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
115
+ when %r'\A/(\w{2})/committees/cipm/meeting/([0-9()I]+).html(#.*)?\z'
116
+ "cipm-decisions:#{$1}/#{$2}#{$3}"
117
+ else
118
+ URI(res.uri).merge(href).to_s # Relative -> absolute
119
+ end
121
120
 
122
- a.set_attribute('href', href)
121
+ a.set_attribute("href", href)
123
122
  end
124
123
  end
125
124
 
126
- def replace_centers ps
127
- centers = ps.css('center').to_a
125
+ def replace_centers(ps)
126
+ centers = ps.css("center").to_a
128
127
  while centers.length > 0
129
128
  center = centers.first
130
129
  current = center
@@ -133,7 +132,7 @@ module Bipm
133
132
  break unless current.next
134
133
  while Nokogiri::XML::Text === current.next
135
134
  current = current.next
136
- break if current.text.strip != ''
135
+ break if current.text.strip != ""
137
136
  end
138
137
  break unless current.next
139
138
  break unless current.next.name == "center"
@@ -159,14 +158,14 @@ module Bipm
159
158
  end
160
159
 
161
160
  # Remove the remaining centers
162
- ps.css('center').each do |i|
161
+ ps.css("center").each do |i|
163
162
  i.replace i.inner_html
164
163
  end
165
164
  end
166
165
 
167
- def format_message part
166
+ def format_message(part)
168
167
  AsciiMath.asciidoc_extract_math(
169
- Coradoc::Input::HTML.convert(part).strip.gsub("&nbsp;", ' ').gsub(" \n", "\n")
168
+ Coradoc::Input::HTML.convert(part).strip.gsub("&nbsp;", " ").gsub(" \n", "\n")
170
169
  )
171
170
  rescue
172
171
  warn "Bug in Coradoc, couldn't parse the following document:"
@@ -175,36 +174,36 @@ module Bipm
175
174
  raise
176
175
  end
177
176
 
178
- def ng_to_string ps
179
- ps.inner_html.encode('utf-8').gsub("\r", '').gsub(%r'</?nobr>','')
177
+ def ng_to_string(ps)
178
+ ps.inner_html.encode("utf-8").gsub("\r", "").gsub(%r'</?nobr>', "")
180
179
  end
181
180
 
182
- def parse_resolution res, res_id, date, type = :cgpm, lang = 'en', rec_type = nil
181
+ def parse_resolution(res, res_id, date, type = :cgpm, lang = "en", rec_type = nil)
183
182
  # Reparse the document after fixing upstream syntax
184
183
  fixed_body = res.body.gsub("<name=", "<a name=")
185
- fixed_body = fixed_body.force_encoding('utf-8')
186
- fixed_body = fixed_body.gsub('&Eacute;', 'É')
187
- fixed_body = fixed_body.gsub('&#171;&#032;', '« ')
188
- fixed_body = fixed_body.gsub('&#032;&#187;', ' »')
189
- fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m, '')
184
+ fixed_body = fixed_body.force_encoding("utf-8")
185
+ fixed_body = fixed_body.gsub("&Eacute;", "É")
186
+ fixed_body = fixed_body.gsub("&#171;&#032;", "« ")
187
+ fixed_body = fixed_body.gsub("&#032;&#187;", " »")
188
+ fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m, "")
190
189
  supertitle = $1.strip
191
- fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m, '')
190
+ fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m, "")
192
191
  title = $1.strip
193
192
  fixed_body = fixed_body.sub(/(="web-content">)\s*<p>\s*(<p)/, '\1\2')
194
193
  fixed_body = fixed_body.gsub(%r"<a name=\"haut\">(.*?)</a>"m, '\1')
195
194
  ng = Nokogiri::HTML(fixed_body, res.uri.to_s, "utf-8", Nokogiri::XML::ParseOptions.new.default_html.noent)
196
195
 
197
- refs = ng.css('.publication-card_reference a')
196
+ refs = ng.css(".publication-card_reference a")
198
197
 
199
198
  if rec_type.end_with? "?"
200
199
  rec_type = case supertitle
201
- when /\AD[eé]claration/
202
- "declaration"
203
- when /\AR[eé]solution/
204
- "resolution"
205
- else
206
- rec_type[..-2]
207
- end
200
+ when /\AD[eé]claration/
201
+ "statement"
202
+ when /\AR[eé]solution/
203
+ "resolution"
204
+ else
205
+ rec_type[..-2]
206
+ end
208
207
  end
209
208
 
210
209
  r = {
@@ -227,7 +226,7 @@ module Bipm
227
226
  r.delete("type") unless r["type"]
228
227
 
229
228
  if refs.length > 0
230
- r["reference"] = res.uri.merge(refs.first.attr('href')).to_s.split('?').first
229
+ r["reference"] = res.uri.merge(refs.first.attr("href")).to_s.split("?").first
231
230
  name, page = refs.first.text.strip.split(/, p(?=[0-9])/)
232
231
  r["reference_name"] = name
233
232
  if page
@@ -241,7 +240,7 @@ module Bipm
241
240
  r.delete("reference_page")
242
241
  end
243
242
 
244
- ps = ng.css('div.journal-content-article').first
243
+ ps = ng.css("div.journal-content-article").first
245
244
 
246
245
  #binding.pry if ps.count != 1
247
246
 
@@ -255,7 +254,7 @@ module Bipm
255
254
  # doc = AsciiMath.html_to_asciimath(doc)
256
255
 
257
256
  if doc.match? DOIREGEX
258
- doc = doc.sub(DOIREGEX, '')
257
+ doc = doc.sub(DOIREGEX, "")
259
258
  r["doi"] = $1
260
259
  end
261
260
 
@@ -279,13 +278,13 @@ module Bipm
279
278
  next
280
279
  end
281
280
 
282
- if parse.start_with? 'NOTE'
283
- part = part.sub('<h3>NOTE</h3>', '')
281
+ if parse.start_with? "NOTE"
282
+ part = part.sub("<h3>NOTE</h3>", "")
284
283
  r["notes"] = Common.format_message(part)
285
284
  next
286
285
  end
287
286
 
288
- CONSIDERATIONS.any? do |k,v|
287
+ CONSIDERATIONS.any? do |k, v|
289
288
  if parse =~ /\A#{PREFIX}#{k}\b/i
290
289
  r["considerations"] << prev = {
291
290
  "type" => v,
@@ -295,7 +294,7 @@ module Bipm
295
294
  end
296
295
  end && next
297
296
 
298
- ACTIONS.any? do |k,v|
297
+ ACTIONS.any? do |k, v|
299
298
  if parse =~ /\A#{PREFIX}#{k}\b/i
300
299
  r["actions"] << prev = {
301
300
  "type" => v,
@@ -326,13 +325,13 @@ module Bipm
326
325
  end
327
326
 
328
327
  %w[considerations actions].each do |type|
329
- map = type == 'actions' ? ACTIONS : CONSIDERATIONS
328
+ map = type == "actions" ? ACTIONS : CONSIDERATIONS
330
329
  r[type] = r[type].map do |i|
331
330
  islist = false
332
331
 
333
332
  kk = nil
334
333
 
335
- if map.any? { |k,v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
334
+ if map.any? { |k, v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
336
335
  prefix = $2
337
336
  suffix = $3
338
337
  subject = $4
@@ -359,15 +358,15 @@ module Bipm
359
358
 
360
359
  if subject
361
360
  #p subject
362
- r['subject'] ||= []
363
- r['subject'] << subject
361
+ r["subject"] ||= []
362
+ r["subject"] << subject
364
363
  end
365
364
 
366
365
  if islist
367
366
  suffix = suffix.strip
368
- suffix = nil if suffix == ''
367
+ suffix = nil if suffix == ""
369
368
  listitems.map do |li|
370
- i.merge 'message' => [prefix, suffix, li].compact.join(" ")
369
+ i.merge "message" => [prefix, suffix, li].compact.join(" ")
371
370
  end
372
371
  else
373
372
  i
@@ -375,13 +374,13 @@ module Bipm
375
374
  end.flatten
376
375
  end
377
376
 
378
- if r['subject']
379
- r['subject'] = r['subject'].uniq.join(" and ")
377
+ if r["subject"]
378
+ r["subject"] = r["subject"].uniq.join(" and ")
380
379
  end
381
380
 
382
381
  # Note: we replace the previously set r['subject'].
383
- r['subject'] = type.to_s.upcase.gsub("-", ' ')
384
- r['subject'] = 'CCDS' if type == :cctf && supertitle.include?("CCDS")
382
+ r["subject"] = type.to_s.upcase.gsub("-", " ")
383
+ r["subject"] = "CCDS" if type == :cctf && supertitle.include?("CCDS")
385
384
 
386
385
  r
387
386
  end
@@ -389,11 +388,11 @@ module Bipm
389
388
  def extract_pdf(meeting, lang)
390
389
  pdfs = meeting.css('a.title-third[href*=".pdf"]')
391
390
  .map { |i| i.attr("href") }
392
- .map { |i| i.split('?').first }
391
+ .map { |i| i.split("?").first }
393
392
  .select do |i|
394
- i.downcase.include?("-#{lang}.pdf") ||
395
- %w[en fr].none? { |l| i.downcase.include? "-#{l}.pdf" }
396
- end
393
+ i.downcase.include?("-#{lang}.pdf") ||
394
+ %w[en fr].none? { |l| i.downcase.include? "-#{l}.pdf" }
395
+ end
397
396
 
398
397
  pdfs = pdfs.first if pdfs.length <= 1
399
398
 
@@ -404,7 +403,7 @@ module Bipm
404
403
  return nil unless date_str
405
404
 
406
405
  date = date_str.strip
407
- .gsub(/\s+/, ' ')
406
+ .gsub(/\s+/, " ")
408
407
  .gsub("février", "february") # 3 first letters must match English
409
408
  .gsub("juin", "june")
410
409
  .gsub("avril", "april")
@@ -425,7 +424,6 @@ module Bipm
425
424
 
426
425
  extend self
427
426
  end
428
-
429
427
  end
430
428
  end
431
429
  end
@@ -3,7 +3,7 @@
3
3
  module Bipm
4
4
  module Data
5
5
  module Importer
6
- VERSION = "0.2.0"
6
+ VERSION = "0.2.1"
7
7
  end
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bipm-data-importer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-12-07 00:00:00.000000000 Z
11
+ date: 2024-12-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri