bipm-data-importer 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3757c28f685035d3af9283b248cc57d523e9735789fe51753622d72962db84a0
4
- data.tar.gz: 874b08f81363f1d27802e87550be8d73aae99cc5156d0b2f0df92b6b055ee780
3
+ metadata.gz: 3bf2e9209301b13261c1b15013611cd17cf5355d6152c1f59d131aa486bbec6c
4
+ data.tar.gz: d1bf3ab0059a945f134068cf3c9cab045fdb6c9970edd3dc0cb2ce839c3dce9b
5
5
  SHA512:
6
- metadata.gz: c394b46052dc8d7dac1f15352176280573ef71c5cb5a72b11e8204499f1f7d7cfb23ac9b09edea208aa7ea3f48eb61f890affb518d3eaf6d79a30879b29fd9a5
7
- data.tar.gz: f0eb33717e70a707e70c491c7c676a5e4b1769aec67b873bdfa2d57317d3c3bc17a7194c8ade7008a58aa8a296e038cddaad9d631490efdeb4bdf61655e9e2e4
6
+ metadata.gz: f653f802d49a88b5e9f0f5b6380a1c9a0fdd73e52be093cae3bc62020d9057221dc9b9264a75bf10198b154f16dd8010480056ae624f7a1c49e7659e1b87b6fc
7
+ data.tar.gz: dd797598112540212aa451e62089b4dde8dc89e4cbfd85e83e6c653e002e65fbab8783ca4cfaab6d8c10a4442e90cacbb6c057b02615c270bf6b7dc3f531a3e0
@@ -1,20 +1,19 @@
1
- require 'mechanize'
2
- require 'coradoc/input/html'
3
- require 'vcr'
4
- require 'date'
5
- require 'fileutils'
6
- require 'pry'
7
- require_relative 'asciimath'
1
+ require "mechanize"
2
+ require "coradoc/input/html"
3
+ require "vcr"
4
+ require "date"
5
+ require "fileutils"
6
+ require "pry"
7
+ require_relative "asciimath"
8
8
 
9
9
  VCR.configure do |c|
10
- c.cassette_library_dir = __dir__+'/../../../../cassettes'
10
+ c.cassette_library_dir = __dir__ + "/../../../../cassettes"
11
11
  c.hook_into :webmock
12
12
  end
13
13
 
14
14
  module Bipm
15
15
  module Data
16
16
  module Importer
17
-
18
17
  CONSIDERATIONS = {
19
18
  /(?:having(?: regard)?|ayant|concerne|vu la|agissant conformément|sachant|de porter)/i => "having / having regard",
20
19
  /(?:noting|to note|took note|note[sd]?|taking note|takes note|constatant|constate|that|notant|notant que|note également|(?:prend|prenant) (?:acte|note))/i => "noting",
@@ -79,52 +78,52 @@ module Bipm
79
78
  /(?:empowers|habilite)/i => "empowers",
80
79
  }
81
80
 
82
- PREFIX1=/(?:The|Le) CIPM |La Conférence |M. Volterra |M. le Président |unanimously |would |a |sont |will |were |did not |strongly |(?:La|The) (?:\d+(?:e|th)|Quinzième) Conférence Générale des Poids et Mesures(?: a |,\s+)?/i
83
- PREFIX2=/The \d+th Conférence Générale des Poids et Mesures |The Conference |and |et (?:en |)|has |renouvelle sa |renews its |further |and further |En ce qui |après avoir |\.\.\.\n+\t*/i
84
- PREFIX3=/Sur la proposition de M. le Président, la convocation de cette Conférence de Thermométrie est |Le texte corrigé, finalement |(?:The|Le) Comité International(?: des Poids et Mesures)?(?: \(CIPM\))?(?: a |,)?\s*/i
85
- PREFIX4=/(?:The |Le |)(?:JCRB|JCGM|CCU|CCTF|CCT|CCRI|CCPR|CCQM|CCM|CCL|CCEM|CCAUV|KCDB),? (?:also |)|Each RMO |fully |The JCRB Rules of Procedure are |Bob Watters and Claudine Thomas /
86
- PREFIX5=/(?:The |Le |All |)(?:incoming |)(?:JCRB |KCDB |)(?:documents|(?:Consultative |)Committees?|Office|Chairman(?: and Secretary|)|Joint BIPM[\/-]ILAC Working Group(?: \(see Action 22\))|RMO(?:[- ]JCRB|) Representatives(?: to the JRCB|)|(?:BIPM |)Director(?: of BIPM|)|SIM|(?:Exec(?:utive|) |)Secretary(?:\(ies\)|)|RMOs, except SIM,|RMOs|APMP|\(?(?:[MD]r|Prof) [A-Z][a-zR-]+\)?|CMCs|EUR[AO]MET|COOMET|GULFMET) |It was /
87
- PREFIX6=/“|"|« à |All RMO documents related to review procedures |Mr Lam and Dr Kühne |The Prof. Kühne, Mr Jones and the Executive Secretary |Ajchara Charoensook, from APMP, /
81
+ PREFIX1 = /(?:The|Le) CIPM |La Conférence |M. Volterra |M. le Président |unanimously |would |a |sont |will |were |did not |strongly |(?:La|The) (?:\d+(?:e|th)|Quinzième) Conférence Générale des Poids et Mesures(?: a |,\s+)?/i
82
+ PREFIX2 = /The \d+th Conférence Générale des Poids et Mesures |The Conference |and |et (?:en |)|has |renouvelle sa |renews its |further |and further |En ce qui |après avoir |\.\.\.\n+\t*/i
83
+ PREFIX3 = /Sur la proposition de M. le Président, la convocation de cette Conférence de Thermométrie est |Le texte corrigé, finalement |(?:The|Le) Comité International(?: des Poids et Mesures)?(?: \(CIPM\))?(?: a |,)?\s*/i
84
+ PREFIX4 = /(?:The |Le |)(?:JCRB|JCGM|CCU|CCTF|CCT|CCRI|CCPR|CCQM|CCM|CCL|CCEM|CCAUV|KCDB),? (?:also |)|Each RMO |fully |The JCRB Rules of Procedure are |Bob Watters and Claudine Thomas /
85
+ PREFIX5 = /(?:The |Le |All |)(?:incoming |)(?:JCRB |KCDB |)(?:documents|(?:Consultative |)Committees?|Office|Chairman(?: and Secretary|)|Joint BIPM[\/-]ILAC Working Group(?: \(see Action 22\))|RMO(?:[- ]JCRB|) Representatives(?: to the JRCB|)|(?:BIPM |)Director(?: of BIPM|)|SIM|(?:Exec(?:utive|) |)Secretary(?:\(ies\)|)|RMOs, except SIM,|RMOs|APMP|\(?(?:[MD]r|Prof) [A-Z][a-zR-]+\)?|CMCs|EUR[AO]MET|COOMET|GULFMET) |It was /
86
+ PREFIX6 = /“|"|« à |All RMO documents related to review procedures |Mr Lam and Dr Kühne |The Prof. Kühne, Mr Jones and the Executive Secretary |Ajchara Charoensook, from APMP, /
88
87
 
89
- PREFIX=/(?:#{PREFIX1}|#{PREFIX2}|#{PREFIX3}|#{PREFIX4}|#{PREFIX5}|#{PREFIX6})?/i
88
+ PREFIX = /(?:#{PREFIX1}|#{PREFIX2}|#{PREFIX3}|#{PREFIX4}|#{PREFIX5}|#{PREFIX6})?/i
90
89
 
91
- SUFFIX=/ (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
90
+ SUFFIX = / (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
92
91
 
93
92
  DOIREGEX = %r'\s+<p>\s+<b>DOI :</b> (.*?)\s+</p>\n\n'
94
93
 
95
94
  module Common
96
- def replace_links ps, res, lang
97
- ps.css('a[href]').each do |a|
98
- href = a.attr('href')
95
+ def replace_links(ps, res, lang)
96
+ ps.css("a[href]").each do |a|
97
+ href = a.attr("href")
99
98
 
100
- href = href.gsub(%r'\Ahttps://www.bipm.org/', '')
99
+ href = href.gsub(%r'\Ahttps://www\.bipm\.org/', "")
101
100
 
102
101
  # Correct links
103
- href = href.gsub('/web/guest/', "/#{lang}/")
102
+ href = href.gsub("/web/guest/", "/#{lang}/")
104
103
 
105
104
  # Account for some mistakes from an upstream document
106
- href = href.gsub(%r"\A/jen/", '/en/')
107
- href = href.gsub(%r"\A/en/CGPM/jsp/", '/en/CGPM/db/')
105
+ href = href.gsub(%r"\A/jen/", "/en/")
106
+ href = href.gsub(%r"\A/en/CGPM/jsp/", "/en/CGPM/db/")
108
107
 
109
108
  href = case href
110
- when %r'\A/(\w{2})/CGPM/db/(\d+)/(\d+)/(#.*)?\z',
111
- %r'\A/jsp/(\w{2})/ViewCGPMResolution\.jsp\?CGPM=(\d+)&RES=(\d+)(#.*)?\z',
112
- %r'\A/(\w{2})/committees/cg/cgpm/(\d+)-\d+/resolution-(\d+)(#.*)?\z',
113
- "cgpm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
114
- when %r'\A/(\w{2})/CIPM/db/(\d+)/(\d+)/(#.*)?\z'
115
- "cipm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
116
- when %r'\A/(\w{2})/committees/cipm/meeting/([0-9()I]+).html(#.*)?\z'
117
- "cipm-decisions:#{$1}/#{$2}#{$3}"
118
- else
119
- URI(res.uri).merge(href).to_s # Relative -> absolute
120
- end
109
+ when %r'\A/(\w{2})/CGPM/db/(\d+)/(\d+)/(#.*)?\z',
110
+ %r'\A/jsp/(\w{2})/ViewCGPMResolution\.jsp\?CGPM=(\d+)&RES=(\d+)(#.*)?\z',
111
+ %r'\A/(\w{2})/committees/cg/cgpm/(\d+)-\d+/resolution-(\d+)(#.*)?\z',
112
+ "cgpm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
113
+ when %r'\A/(\w{2})/CIPM/db/(\d+)/(\d+)/(#.*)?\z'
114
+ "cipm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
115
+ when %r'\A/(\w{2})/committees/cipm/meeting/([0-9()I]+).html(#.*)?\z'
116
+ "cipm-decisions:#{$1}/#{$2}#{$3}"
117
+ else
118
+ URI(res.uri).merge(href).to_s # Relative -> absolute
119
+ end
121
120
 
122
- a.set_attribute('href', href)
121
+ a.set_attribute("href", href)
123
122
  end
124
123
  end
125
124
 
126
- def replace_centers ps
127
- centers = ps.css('center').to_a
125
+ def replace_centers(ps)
126
+ centers = ps.css("center").to_a
128
127
  while centers.length > 0
129
128
  center = centers.first
130
129
  current = center
@@ -133,7 +132,7 @@ module Bipm
133
132
  break unless current.next
134
133
  while Nokogiri::XML::Text === current.next
135
134
  current = current.next
136
- break if current.text.strip != ''
135
+ break if current.text.strip != ""
137
136
  end
138
137
  break unless current.next
139
138
  break unless current.next.name == "center"
@@ -159,14 +158,14 @@ module Bipm
159
158
  end
160
159
 
161
160
  # Remove the remaining centers
162
- ps.css('center').each do |i|
161
+ ps.css("center").each do |i|
163
162
  i.replace i.inner_html
164
163
  end
165
164
  end
166
165
 
167
- def format_message part
166
+ def format_message(part)
168
167
  AsciiMath.asciidoc_extract_math(
169
- Coradoc::Input::HTML.convert(part).strip.gsub("&nbsp;", ' ').gsub(" \n", "\n")
168
+ Coradoc::Input::HTML.convert(part).strip.gsub("&nbsp;", " ").gsub(" \n", "\n")
170
169
  )
171
170
  rescue
172
171
  warn "Bug in Coradoc, couldn't parse the following document:"
@@ -175,36 +174,36 @@ module Bipm
175
174
  raise
176
175
  end
177
176
 
178
- def ng_to_string ps
179
- ps.inner_html.encode('utf-8').gsub("\r", '').gsub(%r'</?nobr>','')
177
+ def ng_to_string(ps)
178
+ ps.inner_html.encode("utf-8").gsub("\r", "").gsub(%r'</?nobr>', "")
180
179
  end
181
180
 
182
- def parse_resolution res, res_id, date, type = :cgpm, lang = 'en', rec_type = nil
181
+ def parse_resolution(res, res_id, date, type = :cgpm, lang = "en", rec_type = nil)
183
182
  # Reparse the document after fixing upstream syntax
184
183
  fixed_body = res.body.gsub("<name=", "<a name=")
185
- fixed_body = fixed_body.force_encoding('utf-8')
186
- fixed_body = fixed_body.gsub('&Eacute;', 'É')
187
- fixed_body = fixed_body.gsub('&#171;&#032;', '« ')
188
- fixed_body = fixed_body.gsub('&#032;&#187;', ' »')
189
- fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m, '')
184
+ fixed_body = fixed_body.force_encoding("utf-8")
185
+ fixed_body = fixed_body.gsub("&Eacute;", "É")
186
+ fixed_body = fixed_body.gsub("&#171;&#032;", "« ")
187
+ fixed_body = fixed_body.gsub("&#032;&#187;", " »")
188
+ fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m, "")
190
189
  supertitle = $1.strip
191
- fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m, '')
190
+ fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m, "")
192
191
  title = $1.strip
193
192
  fixed_body = fixed_body.sub(/(="web-content">)\s*<p>\s*(<p)/, '\1\2')
194
193
  fixed_body = fixed_body.gsub(%r"<a name=\"haut\">(.*?)</a>"m, '\1')
195
194
  ng = Nokogiri::HTML(fixed_body, res.uri.to_s, "utf-8", Nokogiri::XML::ParseOptions.new.default_html.noent)
196
195
 
197
- refs = ng.css('.publication-card_reference a')
196
+ refs = ng.css(".publication-card_reference a")
198
197
 
199
198
  if rec_type.end_with? "?"
200
199
  rec_type = case supertitle
201
- when /\AD[eé]claration/
202
- "declaration"
203
- when /\AR[eé]solution/
204
- "resolution"
205
- else
206
- rec_type[..-2]
207
- end
200
+ when /\AD[eé]claration/
201
+ "statement"
202
+ when /\AR[eé]solution/
203
+ "resolution"
204
+ else
205
+ rec_type[..-2]
206
+ end
208
207
  end
209
208
 
210
209
  r = {
@@ -227,7 +226,7 @@ module Bipm
227
226
  r.delete("type") unless r["type"]
228
227
 
229
228
  if refs.length > 0
230
- r["reference"] = res.uri.merge(refs.first.attr('href')).to_s.split('?').first
229
+ r["reference"] = res.uri.merge(refs.first.attr("href")).to_s.split("?").first
231
230
  name, page = refs.first.text.strip.split(/, p(?=[0-9])/)
232
231
  r["reference_name"] = name
233
232
  if page
@@ -241,7 +240,7 @@ module Bipm
241
240
  r.delete("reference_page")
242
241
  end
243
242
 
244
- ps = ng.css('div.journal-content-article').first
243
+ ps = ng.css("div.journal-content-article").first
245
244
 
246
245
  #binding.pry if ps.count != 1
247
246
 
@@ -255,7 +254,7 @@ module Bipm
255
254
  # doc = AsciiMath.html_to_asciimath(doc)
256
255
 
257
256
  if doc.match? DOIREGEX
258
- doc = doc.sub(DOIREGEX, '')
257
+ doc = doc.sub(DOIREGEX, "")
259
258
  r["doi"] = $1
260
259
  end
261
260
 
@@ -279,13 +278,13 @@ module Bipm
279
278
  next
280
279
  end
281
280
 
282
- if parse.start_with? 'NOTE'
283
- part = part.sub('<h3>NOTE</h3>', '')
281
+ if parse.start_with? "NOTE"
282
+ part = part.sub("<h3>NOTE</h3>", "")
284
283
  r["notes"] = Common.format_message(part)
285
284
  next
286
285
  end
287
286
 
288
- CONSIDERATIONS.any? do |k,v|
287
+ CONSIDERATIONS.any? do |k, v|
289
288
  if parse =~ /\A#{PREFIX}#{k}\b/i
290
289
  r["considerations"] << prev = {
291
290
  "type" => v,
@@ -295,7 +294,7 @@ module Bipm
295
294
  end
296
295
  end && next
297
296
 
298
- ACTIONS.any? do |k,v|
297
+ ACTIONS.any? do |k, v|
299
298
  if parse =~ /\A#{PREFIX}#{k}\b/i
300
299
  r["actions"] << prev = {
301
300
  "type" => v,
@@ -326,13 +325,13 @@ module Bipm
326
325
  end
327
326
 
328
327
  %w[considerations actions].each do |type|
329
- map = type == 'actions' ? ACTIONS : CONSIDERATIONS
328
+ map = type == "actions" ? ACTIONS : CONSIDERATIONS
330
329
  r[type] = r[type].map do |i|
331
330
  islist = false
332
331
 
333
332
  kk = nil
334
333
 
335
- if map.any? { |k,v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
334
+ if map.any? { |k, v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
336
335
  prefix = $2
337
336
  suffix = $3
338
337
  subject = $4
@@ -359,15 +358,15 @@ module Bipm
359
358
 
360
359
  if subject
361
360
  #p subject
362
- r['subject'] ||= []
363
- r['subject'] << subject
361
+ r["subject"] ||= []
362
+ r["subject"] << subject
364
363
  end
365
364
 
366
365
  if islist
367
366
  suffix = suffix.strip
368
- suffix = nil if suffix == ''
367
+ suffix = nil if suffix == ""
369
368
  listitems.map do |li|
370
- i.merge 'message' => [prefix, suffix, li].compact.join(" ")
369
+ i.merge "message" => [prefix, suffix, li].compact.join(" ")
371
370
  end
372
371
  else
373
372
  i
@@ -375,13 +374,13 @@ module Bipm
375
374
  end.flatten
376
375
  end
377
376
 
378
- if r['subject']
379
- r['subject'] = r['subject'].uniq.join(" and ")
377
+ if r["subject"]
378
+ r["subject"] = r["subject"].uniq.join(" and ")
380
379
  end
381
380
 
382
381
  # Note: we replace the previously set r['subject'].
383
- r['subject'] = type.to_s.upcase.gsub("-", ' ')
384
- r['subject'] = 'CCDS' if type == :cctf && supertitle.include?("CCDS")
382
+ r["subject"] = type.to_s.upcase.gsub("-", " ")
383
+ r["subject"] = "CCDS" if type == :cctf && supertitle.include?("CCDS")
385
384
 
386
385
  r
387
386
  end
@@ -389,11 +388,11 @@ module Bipm
389
388
  def extract_pdf(meeting, lang)
390
389
  pdfs = meeting.css('a.title-third[href*=".pdf"]')
391
390
  .map { |i| i.attr("href") }
392
- .map { |i| i.split('?').first }
391
+ .map { |i| i.split("?").first }
393
392
  .select do |i|
394
- i.downcase.include?("-#{lang}.pdf") ||
395
- %w[en fr].none? { |l| i.downcase.include? "-#{l}.pdf" }
396
- end
393
+ i.downcase.include?("-#{lang}.pdf") ||
394
+ %w[en fr].none? { |l| i.downcase.include? "-#{l}.pdf" }
395
+ end
397
396
 
398
397
  pdfs = pdfs.first if pdfs.length <= 1
399
398
 
@@ -404,7 +403,7 @@ module Bipm
404
403
  return nil unless date_str
405
404
 
406
405
  date = date_str.strip
407
- .gsub(/\s+/, ' ')
406
+ .gsub(/\s+/, " ")
408
407
  .gsub("février", "february") # 3 first letters must match English
409
408
  .gsub("juin", "june")
410
409
  .gsub("avril", "april")
@@ -425,7 +424,6 @@ module Bipm
425
424
 
426
425
  extend self
427
426
  end
428
-
429
427
  end
430
428
  end
431
429
  end
@@ -3,7 +3,7 @@
3
3
  module Bipm
4
4
  module Data
5
5
  module Importer
6
- VERSION = "0.2.0"
6
+ VERSION = "0.2.1"
7
7
  end
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bipm-data-importer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-12-07 00:00:00.000000000 Z
11
+ date: 2024-12-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri